Bose, Digbalay; Hebbar, Rajat; Feng, Tiantian; Somandepalli, Krishna; Xu, Anfeng; Narayanan, Shrikanth
MM-AU: Towards Multimodal Understanding of Advertisement Videos Conference
ACM Multimedia, 2023.
BibTeX | Tags: advertisements, computational media understanding, content analysis, multimedia understanding, multimodal
@conference{bose-etal-2023-mm-au,
title = {MM-AU: Towards Multimodal Understanding of Advertisement Videos},
author = {Digbalay Bose and Rajat Hebbar and Tiantian Feng and Krishna Somandepalli and Anfeng Xu and Shrikanth Narayanan},
year = {2023},
date = {2023-10-29},
urldate = {2023-10-29},
publisher = {ACM Multimedia},
keywords = {advertisements, computational media understanding, content analysis, multimedia understanding, multimodal},
pubstate = {published},
tppubtype = {conference}
}
Hebbar, Rajat; Bose, Digbalay; Narayanan, Shrikanth
SEAR: Semantically-grounded Audio Representations Conference
ACM Multimedia, 2023.
BibTeX | Tags: computational media understanding, multimodal, self-supervision
@conference{hebbar-etal-2023-sear,
title = {SEAR: Semantically-grounded Audio Representations},
author = {Rajat Hebbar and Digbalay Bose and Shrikanth Narayanan},
year = {2023},
date = {2023-10-29},
publisher = {ACM Multimedia},
keywords = {computational media understanding, multimodal, self-supervision},
pubstate = {published},
tppubtype = {conference}
}
Bose, Digbalay; Hebbar, Rajat; Somandepalli, Krishna; Narayanan, Shrikanth
Contextually-rich human affect perception using multimodal scene information Conference
IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 2023.
Abstract | Links | BibTeX | Tags: emotion recognition, multimedia understanding, multimodal
@conference{bose-etal-2023-emotion-recognition,
title = {Contextually-rich human affect perception using multimodal scene information},
author = {Digbalay Bose and Rajat Hebbar and Krishna Somandepalli and Shrikanth Narayanan},
url = {https://arxiv.org/abs/2303.06904},
year = {2023},
date = {2023-03-13},
urldate = {2023-03-13},
publisher = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
abstract = {The process of human affect understanding involves the ability to infer person-specific emotional states from various sources including images, speech, and language. Affect perception from images has predominantly focused on expressions extracted from salient face crops. However, emotions perceived by humans rely on multiple contextual cues, including social settings, foreground interactions, and ambient visual scenes. In this work, we leverage pretrained vision-language (VLN) models to extract descriptions of foreground context from images. Further, we propose a multimodal context fusion (MCF) module to combine foreground cues with the visual scene and person-based contextual information for emotion prediction. We show the effectiveness of our proposed modular design on two datasets associated with natural scenes and TV shows.},
keywords = {emotion recognition, multimedia understanding, multimodal},
pubstate = {published},
tppubtype = {conference}
}
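The abstract above describes combining vision-language model descriptions of the foreground with scene- and person-level features. As a rough illustration of what such a fusion step can look like, here is a minimal PyTorch sketch; the shared 512-dimensional space, the single attention layer, the 26 emotion classes, and all names are assumptions for this sketch, not the paper's actual MCF architecture.

import torch
import torch.nn as nn

class ContextFusion(nn.Module):
    """Toy stand-in for a multimodal context fusion (MCF) style module."""
    def __init__(self, dim=512, num_emotions=26):
        super().__init__()
        # Project each context stream (person crop, visual scene,
        # vision-language model description) into a shared space.
        self.person_proj = nn.Linear(dim, dim)
        self.scene_proj = nn.Linear(dim, dim)
        self.text_proj = nn.Linear(dim, dim)
        self.attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
        self.classifier = nn.Linear(dim, num_emotions)

    def forward(self, person, scene, text):
        # Each input: (batch, dim) features from pretrained encoders.
        streams = torch.stack([self.person_proj(person),
                               self.scene_proj(scene),
                               self.text_proj(text)], dim=1)  # (batch, 3, dim)
        fused, _ = self.attn(streams, streams, streams)       # cross-stream attention
        return self.classifier(fused.mean(dim=1))             # emotion logits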
Avramidis, Kleanthis; Stewart, Shanti; Narayanan, Shrikanth
On the Role of Visual Context in Enriching Music Representations Conference
IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 2023.
Abstract | Links | BibTeX | Tags: multimedia understanding, multimodal, music representations
@conference{avramidi-etal-vcmr,
title = {On the Role of Visual Context in Enriching Music Representations},
author = {Kleanthis Avramidis and Shanti Stewart and Shrikanth Narayanan},
url = {https://arxiv.org/abs/2210.15828},
year = {2023},
date = {2023-02-15},
urldate = {2023-02-15},
publisher = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
abstract = {Human perception and experience of music is highly context-dependent. Contextual variability contributes to differences in how we interpret and interact with music, challenging the design of robust models for information retrieval. Incorporating multimodal context from diverse sources provides a promising approach toward modeling this variability. Music presented in media such as movies and music videos provides rich multimodal context that modulates underlying human experiences. However, such context modeling is underexplored, as it requires large amounts of multimodal data along with relevant annotations. Self-supervised learning can help address these challenges by automatically extracting rich, high-level correspondences between different modalities, hence alleviating the need for fine-grained annotations at scale. In this study, we propose VCMR -- Video-Conditioned Music Representations, a contrastive learning framework that learns music representations from audio and the accompanying music videos. The contextual visual information enhances representations of music audio, as evaluated on the downstream task of music tagging. Experimental results show that the proposed framework can contribute additive robustness to audio representations and indicate to what extent musical elements are affected or determined by visual context.},
keywords = {multimedia understanding, multimodal, music representations},
pubstate = {published},
tppubtype = {conference}
}
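VCMR is described as a contrastive framework pairing music audio with its accompanying video. A common way to implement such audio-video correspondence is a symmetric InfoNCE loss over a batch of paired clip embeddings; the sketch below shows that generic recipe, with the temperature value and the symmetric formulation being assumptions rather than details confirmed by the abstract.

import torch
import torch.nn.functional as F

def audio_video_contrastive_loss(audio_emb, video_emb, temperature=0.07):
    """audio_emb, video_emb: (batch, dim) embeddings of corresponding clips."""
    a = F.normalize(audio_emb, dim=-1)
    v = F.normalize(video_emb, dim=-1)
    logits = a @ v.t() / temperature                 # (batch, batch) similarities
    targets = torch.arange(a.size(0), device=a.device)
    # Matched audio/video pairs lie on the diagonal; all others act as negatives.
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))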
Hebbar, Rajat; Bose, Digbalay; Somandepalli, Krishna; Vijai, Veena; Narayanan, Shrikanth
A dataset for Audio-Visual Sound Event Detection in Movies Conference
IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 2023.
Abstract | Links | BibTeX | Tags: audio-visual event detection, multimodal
@conference{hebbar-2023-audio-events,
title = {A dataset for Audio-Visual Sound Event Detection in Movies},
author = {Rajat Hebbar and Digbalay Bose and Krishna Somandepalli and Veena Vijai and Shrikanth Narayanan},
url = {https://arxiv.org/abs/2302.07315},
year = {2023},
date = {2023-02-14},
urldate = {2023-02-14},
publisher = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
abstract = {Audio event detection is a widely studied audio processing task, with applications ranging from self-driving cars to healthcare. In-the-wild datasets such as Audioset have propelled research in this field. However, many efforts typically involve manual annotation and verification, which is expensive to perform at scale. Movies depict various real-life and fictional scenarios, which makes them a rich resource for mining a wide range of audio events. In this work, we present a dataset of audio events called Subtitle-Aligned Movie Sounds (SAM-S). We use publicly available closed-caption transcripts to automatically mine over 110K audio events from 430 movies. We identify three dimensions for categorizing audio events, namely sound, source, and quality, and present the steps involved in producing a final taxonomy of 245 sounds. We discuss the choices involved in generating the taxonomy and also highlight the human-centered nature of sounds in our dataset. We establish a baseline performance of 34.76% mean average precision for audio-only sound classification and show that incorporating visual information can further improve the performance by about 5%.},
keywords = {audio-visual event detection, multimodal},
pubstate = {published},
tppubtype = {conference}
}
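The pipeline above mines audio events from closed captions. Caption files often mark non-speech sounds in brackets (e.g. "[DOOR SLAMS]"), so a first-pass miner can be as simple as the sketch below; the bracket convention and all names here are assumptions for illustration, not details of the SAM-S pipeline.

import re

SOUND_TAG = re.compile(r"[\[\(]([^\]\)]+)[\]\)]")  # text in [...] or (...)

def mine_sound_events(caption_lines):
    """caption_lines: (start_sec, end_sec, text) tuples, e.g. parsed from SRT."""
    events = []
    for start, end, text in caption_lines:
        for match in SOUND_TAG.findall(text):
            events.append((start, end, match.strip().lower()))
    return events

# mine_sound_events([(12.3, 14.1, "[DOOR SLAMS] Who's there?")])
# -> [(12.3, 14.1, 'door slams')]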
Guha, Tanaya; Huang, Che-Wei; Kumar, Naveen; Zhu, Yan; Narayanan, Shrikanth S
Gender Representation in Cinematic Content: A Multimodal Approach Inproceedings
In: Proceedings of the 2015 ACM on International Conference on Multimodal Interaction, pp. 31–34, Association for Computing Machinery, Seattle, Washington, USA, 2015, ISBN: 9781450339124.
Abstract | Links | BibTeX | Tags: content analysis, gender representation, movie, multimodal
@inproceedings{10.1145/2818346.2820778,
title = {Gender Representation in Cinematic Content: A Multimodal Approach},
author = {Tanaya Guha and Che-Wei Huang and Naveen Kumar and Yan Zhu and Shrikanth S Narayanan},
url = {https://doi.org/10.1145/2818346.2820778},
doi = {10.1145/2818346.2820778},
isbn = {9781450339124},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the 2015 ACM on International Conference on Multimodal Interaction},
pages = {31–34},
publisher = {Association for Computing Machinery},
address = {Seattle, Washington, USA},
series = {ICMI '15},
abstract = {The goal of this paper is to enable an objective understanding of gender portrayals in popular films and media through multimodal content analysis. An automated system for analyzing gender representation in terms of screen presence and speaking time is developed. First, we perform independent processing of the video and the audio content to estimate the gender distribution of screen presence at shot level, and of speech at utterance level. A measure of the movie's excitement or intensity is computed using audiovisual features for every scene. This measure is used as a weighting function to combine the gender-based screen/speaking time information at shot/utterance level to compute gender representation for the entire movie. Detailed results and analyses are presented on seventeen full-length Hollywood movies.},
keywords = {content analysis, gender representation, movie, multimodal},
pubstate = {published},
tppubtype = {inproceedings}
}
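The abstract describes weighting per-shot and per-utterance gender statistics by a per-scene excitement measure before aggregating to a movie-level figure. A minimal sketch of that aggregation, assuming a simple normalized weighted sum (the paper's exact weighting function is not specified in the abstract):

def weighted_screen_presence(shots):
    """shots: (female_screen_time, total_screen_time, intensity) per shot."""
    weighted_female = sum(i * f for f, t, i in shots)
    weighted_total = sum(i * t for f, t, i in shots)
    # Fraction of intensity-weighted screen time attributed to female presence.
    return weighted_female / weighted_total if weighted_total else 0.0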