Bose, Digbalay; Hebbar, Rajat; Feng, Tiantian; Somandepalli, Krishna; Xu, Anfeng; Narayanan, Shrikanth
MM-AU: Towards Multimodal Understanding of Advertisement Videos Conference
ACM Multimedia, 2023.
BibTeX | Tags: advertisements, computational media understanding, content analysis, multimedia understanding, multimodal
@conference{bose-etal-2023-mmau,
title = {MM-AU: Towards Multimodal Understanding of Advertisement Videos},
author = {Digbalay Bose and Rajat Hebbar and Tiantian Feng and Krishna Somandepalli and Anfeng Xu and Shrikanth Narayanan},
year = {2023},
date = {2023-10-29},
urldate = {2023-10-29},
publisher = {ACM Multimedia},
keywords = {advertisements, computational media understanding, content analysis, multimedia understanding, multimodal},
pubstate = {published},
tppubtype = {conference}
}
Sharma, Rahul; Narayanan, Shrikanth
Audio-Visual Activity Guided Cross-Modal Identity Association for Active Speaker Detection Journal Article
In: IEEE Open Journal of Signal Processing, pp. 225–232, 2023.
Abstract | Links | BibTeX | Tags: active speaker localization, computational media understanding, cross-modal learning, multimedia understanding
@article{sharma-narayanan-2023-asd,
title = {Audio-Visual Activity Guided Cross-Modal Identity Association for Active Speaker Detection},
author = {Rahul Sharma and Shrikanth Narayanan},
doi = {10.1109/OJSP.2023.3267269},
year = {2023},
date = {2023-04-14},
urldate = {2023-04-14},
journal = {IEEE Open Journal of Signal Processing},
pages = {225--232},
abstract = {Active speaker detection in videos addresses associating a source face, visible in the video frames, with the underlying speech in the audio modality. The two primary sources of information to derive such a speech-face relationship are i) visual activity and its interaction with the speech signal and ii) co-occurrences of speakers' identities across modalities in the form of face and speech. The two approaches have their limitations: the audio-visual activity models get confused with other frequently occurring vocal activities, such as laughing and chewing, while the speakers' identity-based methods are limited to videos having enough disambiguating information to establish a speech-face association. Since the two approaches are independent, we investigate their complementary nature in this work. We propose a novel unsupervised framework to guide the speakers' cross-modal identity association with the audio-visual activity for active speaker detection. Through experiments on entertainment media videos from two benchmark datasets–the AVA active speaker (movies) and Visual Person Clustering Dataset (TV shows)–we show that a simple late fusion of the two approaches enhances the active speaker detection performance.},
keywords = {active speaker localization, computational media understanding, cross-modal learning, multimedia understanding},
pubstate = {published},
tppubtype = {article}
}
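A minimal sketch of the late fusion described in the abstract above: each candidate face track gets a score from the audio-visual activity model and a score from the cross-modal identity association, and the two are combined per track. The function name, score ranges, and the fusion weight alpha are illustrative assumptions, not the paper's exact formulation.

import numpy as np

def late_fusion(av_activity_scores, identity_scores, alpha=0.5):
    """Combine per-face-track scores from the two independent approaches.

    av_activity_scores: audio-visual activity scores in [0, 1], one per face track
    identity_scores:    cross-modal identity association scores in [0, 1]
    alpha:              fusion weight (assumed value; the paper reports a simple
                        late fusion whose exact weighting may differ)
    """
    av = np.asarray(av_activity_scores, dtype=float)
    ident = np.asarray(identity_scores, dtype=float)
    return alpha * av + (1.0 - alpha) * ident

# Example: three candidate face tracks for one speech segment
scores = late_fusion([0.9, 0.2, 0.4], [0.7, 0.6, 0.1])
active_speaker = int(np.argmax(scores))  # index of the predicted active speaker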
Bose, Digbalay; Hebbar, Rajat; Somandepalli, Krishna; Narayanan, Shrikanth
Contextually-rich human affect perception using multimodal scene information Conference
IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 2023.
Abstract | Links | BibTeX | Tags: emotion recognition, multimedia understanding, multimodal
@conference{bose-etal-2023-emotion-recognition,
title = {Contextually-rich human affect perception using multimodal scene information},
author = {Digbalay Bose and Rajat Hebbar and Krishna Somandepalli and Shrikanth Narayanan},
url = {https://arxiv.org/abs/2303.06904},
year = {2023},
date = {2023-03-13},
urldate = {2023-03-13},
publisher = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
abstract = {The process of human affect understanding involves the ability to infer person specific emotional states from various sources including images, speech, and language. Affect perception from images has predominantly focused on expressions extracted from salient face crops. However, emotions perceived by humans rely on multiple contextual cues including social settings, foreground interactions, and ambient visual scenes. In this work, we leverage pretrained vision-language (VLN) models to extract descriptions of foreground context from images. Further, we propose a multimodal context fusion (MCF) module to combine foreground cues with the visual scene and person-based contextual information for emotion prediction. We show the effectiveness of our proposed modular design on two datasets associated with natural scenes and TV shows.},
keywords = {emotion recognition, multimedia understanding, multimodal},
pubstate = {published},
tppubtype = {conference}
}
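An illustrative sketch, not the paper's implementation, of fusing the three kinds of context mentioned in the abstract above: person-based features, visual-scene features, and an embedding of the foreground description produced by a vision-language model. All dimensions, the concatenation strategy, and the number of emotion classes are assumptions for illustration.

import torch
import torch.nn as nn

class MultimodalContextFusion(nn.Module):
    """Toy fusion of person, scene, and foreground-caption embeddings for
    emotion prediction. The paper's MCF module may differ in design."""
    def __init__(self, person_dim=512, scene_dim=512, caption_dim=768,
                 hidden_dim=256, num_emotions=26):
        super().__init__()
        self.fuse = nn.Sequential(
            nn.Linear(person_dim + scene_dim + caption_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_emotions),
        )

    def forward(self, person_feat, scene_feat, caption_feat):
        fused = torch.cat([person_feat, scene_feat, caption_feat], dim=-1)
        return self.fuse(fused)  # emotion logits

# Example with random features for a batch of 4 images
model = MultimodalContextFusion()
logits = model(torch.randn(4, 512), torch.randn(4, 512), torch.randn(4, 768))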
Avramidis, Kleanthis; Stewart, Shanti; Narayanan, Shrikanth
On the Role of Visual Context in Enriching Music Representations Conference
IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 2023.
Abstract | Links | BibTeX | Tags: multimedia understanding, multimodal, music representations
@conference{avramidi-etal-vcmr,
title = {On the Role of Visual Context in Enriching Music Representations},
author = {Kleanthis Avramidis and Shanti Stewart and Shrikanth Narayanan},
url = {https://arxiv.org/abs/2210.15828},
year = {2023},
date = {2023-02-15},
urldate = {2023-02-15},
publisher = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
abstract = {Human perception and experience of music is highly context-dependent. Contextual variability contributes to differences in how we interpret and interact with music, challenging the design of robust models for information retrieval. Incorporating multimodal context from diverse sources provides a promising approach toward modeling this variability. Music presented in media such as movies and music videos provide rich multimodal context that modulates underlying human experiences. However, such context modeling is underexplored, as it requires large amounts of multimodal data along with relevant annotations. Self-supervised learning can help address these challenges by automatically extracting rich, high-level correspondences between different modalities, hence alleviating the need for fine-grained annotations at scale. In this study, we propose VCMR -- Video-Conditioned Music Representations, a contrastive learning framework that learns music representations from audio and the accompanying music videos. The contextual visual information enhances representations of music audio, as evaluated on the downstream task of music tagging. Experimental results show that the proposed framework can contribute additive robustness to audio representations and indicates to what extent musical elements are affected or determined by visual context.},
keywords = {multimedia understanding, multimodal, music representations},
pubstate = {published},
tppubtype = {conference}
}
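The self-supervised audio-video correspondence described in the abstract above can be illustrated with a symmetric InfoNCE-style contrastive objective that pairs each music-audio clip with its co-occurring video clip. Embedding sizes, the temperature, and the exact loss form are assumptions; VCMR's actual objective may differ.

import torch
import torch.nn.functional as F

def contrastive_audio_video_loss(audio_emb, video_emb, temperature=0.07):
    """Symmetric contrastive loss over a batch of paired audio/video embeddings
    (illustrative sketch, not the published VCMR objective)."""
    a = F.normalize(audio_emb, dim=-1)
    v = F.normalize(video_emb, dim=-1)
    logits = a @ v.t() / temperature                    # (batch, batch) similarities
    targets = torch.arange(a.size(0), device=a.device)  # matched pairs on the diagonal
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

# Example: a batch of 8 paired audio/video embeddings
loss = contrastive_audio_video_loss(torch.randn(8, 128), torch.randn(8, 128))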
Baruah, Sabyasachee; Narayanan, Shrikanth
Character Coreference Resolution in Movie Screenplays Inproceedings
In: Findings of the Association for Computational Linguistics: ACL 2023, pp. 10300–10313, 2023.
Abstract | BibTeX | Tags: content analysis, coreference resolution, multimedia understanding
@inproceedings{baruah2023character,
title = {Character Coreference Resolution in Movie Screenplays},
author = {Sabyasachee Baruah and Shrikanth Narayanan},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
pages = {10300--10313},
abstract = {Movie screenplays have a distinct narrative structure. It segments the story into scenes containing interleaving descriptions of actions, locations, and character dialogues. A typical screenplay spans several scenes and can include long-range dependencies between characters and events. A holistic document-level understanding of the screenplay requires several natural language processing capabilities, such as parsing, character identification, coreference resolution, action recognition, summarization, and attribute discovery. In this work, we develop scalable and robust methods to extract the structural information and character coreference clusters from full-length movie screenplays. We curate two datasets for screenplay parsing and character coreference—MovieParse and MovieCoref, respectively. We build a robust screenplay parser to handle inconsistencies in screenplay formatting and leverage the parsed output to link co-referring character mentions. Our coreference models can scale to long screenplay documents without drastically increasing their memory footprints.},
keywords = {content analysis, coreference resolution, multimedia understanding},
pubstate = {published},
tppubtype = {inproceedings}
}
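A toy illustration of the screenplay-structure idea in the abstract above: speaker cues in screenplays are conventionally written in uppercase, so normalizing cues such as "JANE (V.O.)" and "JANE" gives a first, crude grouping of character mentions. The real systems in the paper are learned parsing and coreference models; this rule-based normalization is only an assumed, simplified stand-in.

import re
from collections import defaultdict

def cluster_speaker_cues(screenplay_lines):
    """Group line indices by normalized speaker cue (illustrative sketch only)."""
    clusters = defaultdict(list)
    for i, line in enumerate(screenplay_lines):
        if line.strip() and line.isupper():  # crude check for a speaker-cue line
            name = re.sub(r"\s*\((V\.O\.|O\.S\.|CONT'D)\)", "", line).strip()
            clusters[name].append(i)
    return dict(clusters)

print(cluster_speaker_cues(["JANE (V.O.)", "Hello there.", "JANE", "Hi again."]))
# {'JANE': [0, 2]}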
Martinez, Victor; Somandepalli, Krishna; Narayanan, Shrikanth
Boys don’t cry (or kiss or dance): A computational linguistic lens into gendered actions in film Journal Article
In: PLoS One, 2022.
BibTeX | Tags: gendered analysis, multimedia understanding, semantic role labeling
@article{Martinez2022,
title = {Boys don’t cry (or kiss or dance): A computational linguistic lens into gendered actions in film},
author = {Victor Martinez and Krishna Somandepalli and Shrikanth Narayanan},
year = {2022},
date = {2022-12-20},
urldate = {2022-12-20},
journal = {PLoS One},
keywords = {gendered analysis, multimedia understanding, semantic role labeling},
pubstate = {published},
tppubtype = {article}
}