Sharma, Rahul; Narayanan, Shrikanth
Audio-Visual Activity Guided Cross-Modal Identity Association for Active Speaker Detection Journal Article
In: IEEE Open Journal of Signal Processing, pp. 225-232, 2023.
@article{Sharma2023,
title = {Audio-Visual Activity Guided Cross-Modal Identity Association for Active Speaker Detection},
author = {Rahul Sharma and Shrikanth Narayanan},
doi = {10.1109/OJSP.2023.3267269},
year = {2023},
date = {2023-04-14},
urldate = {2023-04-14},
journal = {IEEE Open Journal of Signal Processing},
pages = {225-232},
abstract = {Active speaker detection in videos addresses the task of associating a source face, visible in the video frames, with the underlying speech in the audio modality. The two primary sources of information for deriving such a speech-face relationship are i) visual activity and its interaction with the speech signal and ii) co-occurrences of speakers' identities across modalities in the form of face and speech. Each approach has its limitations: audio-visual activity models get confused by other frequently occurring vocal activities, such as laughing and chewing, while speaker-identity-based methods are limited to videos with enough disambiguating information to establish a speech-face association. Since the two approaches are independent, we investigate their complementary nature in this work. We propose a novel unsupervised framework that guides the speakers' cross-modal identity association with audio-visual activity for active speaker detection. Through experiments on entertainment media videos from two benchmark datasets, the AVA active speaker dataset (movies) and the Visual Person Clustering Dataset (TV shows), we show that a simple late fusion of the two approaches enhances active speaker detection performance.},
keywords = {active speaker localization, computational media understanding, cross-modal learning, multimedia understanding},
pubstate = {published},
tppubtype = {article}
}
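The core mechanism this abstract reports, a simple late fusion of the audio-visual activity scores and the cross-modal identity scores, can be sketched in a few lines of Python. The convex weighting, per-face-track score ranges, and names below are assumptions for illustration, not the paper's exact formulation.

import numpy as np

def late_fuse(activity_scores: np.ndarray,
              identity_scores: np.ndarray,
              alpha: float = 0.5) -> np.ndarray:
    # Convex combination of the two per-face-track posteriors.
    return alpha * activity_scores + (1.0 - alpha) * identity_scores

# Dummy posteriors in [0, 1] for three face tracks in a scene,
# one score per track from each independent system.
audio_visual = np.array([0.9, 0.2, 0.6])  # audio-visual activity model
identity = np.array([0.7, 0.1, 0.8])      # cross-modal identity association
print(late_fuse(audio_visual, identity, alpha=0.6))  # fused scores

Because the two systems are independent, even this single-parameter fusion can recover cases where one system is confused (e.g., by laughing or chewing) but the other is not.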
Sharma, Rahul; Somandepalli, Krishna; Narayanan, Shrikanth
Cross modal video representations for weakly supervised active speaker localization Journal Article
In: IEEE Transactions on Multimedia, Early Access, pp. 1-12, 2022.
@article{Sharma2022,
title = {Cross modal video representations for weakly supervised active speaker localization},
author = {Rahul Sharma and Krishna Somandepalli and Shrikanth Narayanan},
url = {https://ieeexplore.ieee.org/document/9991097},
doi = {10.1109/TMM.2022.3229975},
year = {2022},
date = {2022-12-16},
urldate = {2022-12-16},
journal = {IEEE Transactions on Multimedia},
volume = {Early Access},
pages = {1-12},
abstract = {An objective understanding of media depictions, such as inclusive portrayals of how much someone is heard and seen on screen in film and television, requires machines to automatically discern who is talking, and when, how, and where, and who is not. Speaker activity can be automatically discerned from the rich multimodal information present in media content. This is, however, a challenging problem due to the vast variety and contextual variability of media content and the lack of labeled data. In this work, we present a cross-modal neural network for learning visual representations that carry implicit information about the spatial location of a speaker in the visual frames. Since manual annotations of active speakers in visual frames are very expensive to acquire, we present a weakly supervised system for the task of localizing active speakers in movie content. We use the learned cross-modal visual representations and provide weak supervision from movie subtitles acting as a proxy for voice activity, thus requiring no manual annotations. Furthermore, we propose an audio-assisted post-processing formulation for the task of active speaker detection. We evaluate the performance of the proposed system on three benchmark datasets: i) the AVA active speaker dataset, ii) the Visual Person Clustering Dataset, and iii) the Columbia dataset, and demonstrate the effectiveness of the cross-modal embeddings for localizing active speakers in comparison to fully supervised systems.},
keywords = {active speaker localization, cross-modal learning, multiple instance learning, weakly supervised learning},
pubstate = {published},
tppubtype = {article}
}
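The weak-supervision setup this abstract describes fits a standard multiple-instance learning pattern: subtitle-derived voice activity gives a frame-level label, and pooling over per-region scores localizes the speaker without region-level annotations. The following PyTorch sketch shows that pattern; the model structure, embedding dimension, and all names are hypothetical, not the paper's architecture.

import torch
import torch.nn as nn

class WeakSpeakerLocalizer(nn.Module):
    # Scores each spatial region of a frame for speaker activity.
    def __init__(self, embed_dim: int = 512):
        super().__init__()
        self.scorer = nn.Linear(embed_dim, 1)

    def forward(self, regions):
        # regions: (batch, num_regions, embed_dim) cross-modal visual embeddings.
        region_scores = torch.sigmoid(self.scorer(regions)).squeeze(-1)
        # Multiple-instance pooling: the frame is "speaking" if any region is.
        frame_score = region_scores.max(dim=1).values
        return region_scores, frame_score

model = WeakSpeakerLocalizer()
regions = torch.randn(8, 16, 512)        # dummy per-region embeddings
vad = torch.randint(0, 2, (8,)).float()  # weak frame labels from subtitles
region_scores, frame_score = model(regions)  # region_scores localize the speaker
loss = nn.functional.binary_cross_entropy(frame_score, vad)
loss.backward()

Training only ever compares the pooled frame score against the subtitle-derived label, yet the per-region scores learned along the way provide the spatial localization at inference time.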