Bose, Digbalay; Hebbar, Rajat; Feng, Tiantian; Somandepalli, Krishna; Xu, Anfeng; Narayanan, Shrikanth
MM-AU: Towards Multimodal Understanding of Advertisement Videos Conference
ACM Multimedia, 2023.
BibTeX | Tags: advertisements, computational media understanding, content analysis, multimedia understanding, multimodal
@conference{bose2023mmau,
  title     = {{MM-AU}: Towards Multimodal Understanding of Advertisement Videos},
  author    = {Bose, Digbalay and Hebbar, Rajat and Feng, Tiantian and Somandepalli, Krishna and Xu, Anfeng and Narayanan, Shrikanth},
  booktitle = {ACM Multimedia},
  year      = {2023},
  date      = {2023-10-29},
  urldate   = {2023-10-29},
  keywords  = {advertisements, computational media understanding, content analysis, multimedia understanding, multimodal},
  pubstate  = {published},
  tppubtype = {conference}
}
Hebbar, Rajat; Bose, Digbalay; Narayanan, Shrikanth
SEAR: Semantically-grounded Audio Representations Conference
ACM Multimedia, 2023.
BibTeX | Tags: computational media understanding, multimodal, self-supervision
@conference{hebbar2023sear,
  title     = {{SEAR}: Semantically-grounded Audio Representations},
  author    = {Hebbar, Rajat and Bose, Digbalay and Narayanan, Shrikanth},
  booktitle = {ACM Multimedia},
  year      = {2023},
  date      = {2023-10-29},
  keywords  = {computational media understanding, multimodal, self-supervision},
  pubstate  = {published},
  tppubtype = {conference}
}
Sharma, Rahul; Narayanan, Shrikanth
Audio-Visual Activity Guided Cross-Modal Identity Association for Active Speaker Detection Journal Article
In: IEEE Open Journal of Signal Processing, pp. 225-232, 2023.
Abstract | Links | BibTeX | Tags: active speaker localization, computational media understanding, cross-modal learning, multimedia understanding
@article{sharma2023activespeaker,
  title     = {Audio-Visual Activity Guided Cross-Modal Identity Association for Active Speaker Detection},
  author    = {Sharma, Rahul and Narayanan, Shrikanth},
  doi       = {10.1109/OJSP.2023.3267269},
  year      = {2023},
  date      = {2023-04-14},
  urldate   = {2023-04-14},
  journal   = {IEEE Open Journal of Signal Processing},
  pages     = {225--232},
  abstract  = {Active speaker detection in videos addresses associating a source face, visible in the video frames, with the underlying speech in the audio modality. The two primary sources of information to derive such a speech-face relationship are i) visual activity and its interaction with the speech signal and ii) co-occurrences of speakers' identities across modalities in the form of face and speech. The two approaches have their limitations: the audio-visual activity models get confused with other frequently occurring vocal activities, such as laughing and chewing, while the speakers' identity-based methods are limited to videos having enough disambiguating information to establish a speech-face association. Since the two approaches are independent, we investigate their complementary nature in this work. We propose a novel unsupervised framework to guide the speakers' cross-modal identity association with the audio-visual activity for active speaker detection. Through experiments on entertainment media videos from two benchmark datasets–the AVA active speaker (movies) and Visual Person Clustering Dataset (TV shows)–we show that a simple late fusion of the two approaches enhances the active speaker detection performance.},
  keywords  = {active speaker localization, computational media understanding, cross-modal learning, multimedia understanding},
  pubstate  = {published},
  tppubtype = {article}
}
Somandepalli, Krishna; Hebbar, Rajat; Narayanan, Shrikanth
Robust Character Labeling in Movie Videos: Data Resources and Self-supervised Feature Adaptation. Journal Article
In: IEEE Transactions on Multimedia, 24, pp. 3355-3368, 2021.
Abstract | Links | BibTeX | Tags: computational media understanding, face clustering, face diarization, multiview correlation, self-supervision, triplet loss, video character labeling
@article{Somandepalli2021b,
  title     = {Robust Character Labeling in Movie Videos: Data Resources and Self-supervised Feature Adaptation},
  author    = {Somandepalli, Krishna and Hebbar, Rajat and Narayanan, Shrikanth},
  url       = {https://sail.usc.edu/publications/files/Somandepalli-TMM2021.pdf},
  doi       = {10.1109/TMM.2021.3096155},
  year      = {2021},
  date      = {2021-07-09},
  urldate   = {2021-07-09},
  journal   = {IEEE Transactions on Multimedia},
  volume    = {24},
  pages     = {3355--3368},
  abstract  = {Robust face clustering is a vital step in enabling computational understanding of visual character portrayal in media. Face clustering for long-form content is challenging because of variations in appearance and lack of supporting large-scale labeled data. Our work in this paper focuses on two key aspects of this problem: the lack of domain-specific training or benchmark datasets, and adapting face embeddings learned on web images to long-form content, specifically movies. First, we present a dataset of over 169000 face tracks curated from 240 Hollywood movies with weak labels on whether a pair of face tracks belong to the same or a different character. We propose an offline algorithm based on nearest-neighbor search in the embedding space to mine hard-examples from these tracks. We then investigate triplet-loss and multiview correlation-based methods for adapting face embeddings to hard-examples. Our experimental results highlight the usefulness of weakly labeled data for domain-specific feature adaptation. Overall, we find that multiview correlation-based adaptation yields more discriminative and robust face embeddings. Its performance on downstream face verification and clustering tasks is comparable to that of the state-of-the-art results in this domain. We also present the SAIL-Movie Character Benchmark corpus developed to augment existing benchmarks. It consists of racially diverse actors and provides face-quality labels for subsequent error analysis. We hope that the large-scale datasets developed in this work can further advance automatic character labeling in videos. All resources are available freely at https://sail.usc.edu/~ccmi/multiface .},
  keywords  = {computational media understanding, face clustering, face diarization, multiview correlation, self-supervision, triplet loss, video character labeling},
  pubstate  = {published},
  tppubtype = {article}
}