Bose, Digbalay; Hebbar, Rajat; Feng, Tiantian; Somandepalli, Krishna; Xu, Anfeng; Narayanan, Shrikanth
MM-AU: Towards Multimodal Understanding of Advertisement Videos Conference
ACM Multimedia, 2023.
@conference{bose2023mmau,
title = {MM-AU: Towards Multimodal Understanding of Advertisement Videos},
author = {Digbalay Bose and Rajat Hebbar and Tiantian Feng and Krishna Somandepalli and Anfeng Xu and Shrikanth Narayanan},
year = {2023},
date = {2023-10-29},
booktitle = {ACM Multimedia},
publisher = {Association for Computing Machinery},
keywords = {advertisements, computational media understanding, content analysis, multimedia understanding, multimodal},
pubstate = {published},
tppubtype = {conference}
}
Somandepalli, Krishna; Martinez, Victor; Kumar, Naveen; Narayanan, Shrikanth
Multimodal Representation of Advertisements Using Segment-Level Autoencoders Inproceedings
In: Proceedings of the 20th ACM International Conference on Multimodal Interaction, pp. 418–422, Association for Computing Machinery, Boulder, CO, USA, 2018, ISBN: 9781450356923.
@inproceedings{10.1145/3242969.3243026,
title = {Multimodal Representation of Advertisements Using Segment-Level Autoencoders},
author = {Krishna Somandepalli and Victor Martinez and Naveen Kumar and Shrikanth Narayanan},
url = {https://doi.org/10.1145/3242969.3243026},
doi = {10.1145/3242969.3243026},
isbn = {9781450356923},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 20th ACM International Conference on Multimodal Interaction},
pages = {418--422},
publisher = {Association for Computing Machinery},
address = {Boulder, CO, USA},
series = {ICMI '18},
abstract = {Automatic analysis of advertisements (ads) poses an interesting problem for learning
multimodal representations. A promising direction of research is the development of
deep neural network autoencoders to obtain inter-modal and intra-modal representations.
In this work, we propose a system to obtain segment-level unimodal and joint representations.
These features are concatenated, and then averaged across the duration of an ad to
obtain a single multimodal representation. The autoencoders are trained using segments
generated by time-aligning frames between the audio and video modalities with forward
and backward context. In order to assess the multimodal representations, we consider
the tasks of classifying an ad as funny or exciting in a publicly available dataset
of 2,720 ads. For this purpose we train the segment-level autoencoders on a larger,
unlabeled dataset of 9,740 ads, agnostic of the test set. Our experiments show that:
1) the multimodal representations outperform joint and unimodal representations, 2)
the different representations we learn are complementary to each other, and 3) the
segment-level multimodal representations perform better than classical autoencoders
and cross-modal representations -- within the context of the two classification tasks.
We obtain an improvement of about 5% in classification accuracy compared to a competitive
baseline.},
keywords = {advertisements, autoencoders, multimodal joint representation},
pubstate = {published},
tppubtype = {inproceedings}
}
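
The abstract above outlines the approach only at a high level, so the following is a hypothetical PyTorch sketch of the segment-level autoencoder idea, not the authors' released code. The feature dimensions, layer sizes, and the single shared joint encoder are illustrative assumptions; the forward/backward-context time alignment used to build segments is not reproduced here.

# Hypothetical sketch of the segment-level autoencoder idea from the abstract:
# per-segment unimodal encoders, a joint bottleneck over both modalities, and
# averaging of concatenated segment codes to get one vector per ad.
# All dimensions and layer sizes below are assumptions, not the paper's values.
import torch
import torch.nn as nn

class SegmentAutoencoder(nn.Module):
    def __init__(self, audio_dim=128, video_dim=512, hidden=256, bottleneck=64):
        super().__init__()
        # Unimodal (intra-modal) encoders
        self.audio_enc = nn.Sequential(nn.Linear(audio_dim, hidden), nn.ReLU(),
                                       nn.Linear(hidden, bottleneck))
        self.video_enc = nn.Sequential(nn.Linear(video_dim, hidden), nn.ReLU(),
                                       nn.Linear(hidden, bottleneck))
        # Joint (inter-modal) encoder over the concatenated unimodal codes
        self.joint_enc = nn.Linear(2 * bottleneck, bottleneck)
        # Decoders reconstruct each modality from the joint code
        self.audio_dec = nn.Sequential(nn.Linear(bottleneck, hidden), nn.ReLU(),
                                       nn.Linear(hidden, audio_dim))
        self.video_dec = nn.Sequential(nn.Linear(bottleneck, hidden), nn.ReLU(),
                                       nn.Linear(hidden, video_dim))

    def forward(self, audio, video):
        za, zv = self.audio_enc(audio), self.video_enc(video)
        zj = self.joint_enc(torch.cat([za, zv], dim=-1))
        return self.audio_dec(zj), self.video_dec(zj), (za, zv, zj)

def ad_level_representation(model, audio_segs, video_segs):
    """Concatenate unimodal and joint codes per segment, then average over the ad."""
    with torch.no_grad():
        _, _, (za, zv, zj) = model(audio_segs, video_segs)
    return torch.cat([za, zv, zj], dim=-1).mean(dim=0)  # shape: (3 * bottleneck,)

# Usage: 20 time-aligned segments of one ad (segment features are random stand-ins)
model = SegmentAutoencoder()
audio_segs, video_segs = torch.randn(20, 128), torch.randn(20, 512)
rep = ad_level_representation(model, audio_segs, video_segs)
print(rep.shape)  # torch.Size([192])

Averaging the concatenated segment codes, as the abstract describes, yields one fixed-length multimodal vector per ad regardless of its duration, which can then feed a downstream funny/exciting classifier.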