Hebbar, Rajat; Bose, Digbalay; Narayanan, Shrikanth
SEAR: Semantically-grounded Audio Representations Conference
ACM Multimedia, 2023.
BibTeX | Tags: computational media understanding, multimodal, self-supervision
@inproceedings{hebbar2023sear,
  title     = {{SEAR}: Semantically-grounded Audio Representations},
  author    = {Hebbar, Rajat and Bose, Digbalay and Narayanan, Shrikanth},
  booktitle = {Proceedings of the 31st ACM International Conference on Multimedia},
  publisher = {ACM},
  year      = {2023},
  date      = {2023-10-29},
  keywords  = {computational media understanding, multimodal, self-supervision},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Greer, Timothy; Shi, Xuan; Ma, Benjamin; Narayanan, Shrikanth
Creating musical features using multi-faceted, multi-task encoders based on transformers Journal Article
In: Scientific Reports, 13 (1), pp. 10713, 2023.
Abstract | BibTeX | Tags: autoencoders, music representations, self-supervision
@article{greer2023creating,
  title     = {Creating musical features using multi-faceted, multi-task encoders based on transformers},
  author    = {Greer, Timothy and Shi, Xuan and Ma, Benjamin and Narayanan, Shrikanth},
  journal   = {Scientific Reports},
  volume    = {13},
  number    = {1},
  pages     = {10713},
  publisher = {Nature Publishing Group UK London},
  year      = {2023},
  date      = {2023-01-01},
  urldate   = {2023-01-01},
  abstract  = {Computational machine intelligence approaches have enabled a variety of music-centric technologies in support of creating, sharing and interacting with music content. A strong performance on specific downstream application tasks, such as music genre detection and music emotion recognition, is paramount to ensuring broad capabilities for computational music understanding and Music Information Retrieval. Traditional approaches have relied on supervised learning to train models to support these music-related tasks. However, such approaches require copious annotated data and still may only provide insight into one view of music—namely, that related to the specific task at hand. We present a new model for generating audio-musical features that support music understanding, leveraging self-supervision and cross-domain learning. After pre-training using masked reconstruction of musical input features using self-attention bidirectional transformers, output representations are fine-tuned using several downstream music understanding tasks. Results show that the features generated by our multi-faceted, multi-task, music transformer model, which we call M3BERT, tend to outperform other audio and music embeddings on several diverse music-related tasks, indicating the potential of self-supervised and semi-supervised learning approaches toward a more generalized and robust computational approach to modeling music. Our work can offer a starting point for many music-related modeling tasks, with potential applications in learning deep representations and enabling robust technology applications.},
  keywords  = {autoencoders, music representations, self-supervision},
  pubstate  = {published},
  tppubtype = {article}
}
Somandepalli, Krishna; Hebbar, Rajat; Narayanan, Shrikanth
Robust Character Labeling in Movie Videos: Data Resources and Self-supervised Feature Adaptation. Journal Article
In: IEEE Transactions on Multimedia, 24, pp. 3355–3368, 2021.
Abstract | Links | BibTeX | Tags: computational media understanding, face clustering, face diarization, multiview correlation, self-supervision, triplet loss, video character labeling
@article{Somandepalli2021b,
  title     = {Robust Character Labeling in Movie Videos: Data Resources and Self-supervised Feature Adaptation},
  author    = {Somandepalli, Krishna and Hebbar, Rajat and Narayanan, Shrikanth},
  url       = {https://sail.usc.edu/publications/files/Somandepalli-TMM2021.pdf},
  doi       = {10.1109/TMM.2021.3096155},
  journal   = {IEEE Transactions on Multimedia},
  volume    = {24},
  pages     = {3355--3368},
  year      = {2021},
  date      = {2021-07-09},
  urldate   = {2021-07-09},
  abstract  = {Robust face clustering is a vital step in enabling computational understanding of visual character portrayal in media. Face clustering for long-form content is challenging because of variations in appearance and lack of supporting large-scale labeled data. Our work in this paper focuses on two key aspects of this problem: the lack of domain-specific training or benchmark datasets, and adapting face embeddings learned on web images to long-form content, specifically movies. First, we present a dataset of over 169000 face tracks curated from 240 Hollywood movies with weak labels on whether a pair of face tracks belong to the same or a different character. We propose an offline algorithm based on nearest-neighbor search in the embedding space to mine hard-examples from these tracks. We then investigate triplet-loss and multiview correlation-based methods for adapting face embeddings to hard-examples. Our experimental results highlight the usefulness of weakly labeled data for domain-specific feature adaptation. Overall, we find that multiview correlation-based adaptation yields more discriminative and robust face embeddings. Its performance on downstream face verification and clustering tasks is comparable to that of the state-of-the-art results in this domain. We also present the SAIL-Movie Character Benchmark corpus developed to augment existing benchmarks. It consists of racially diverse actors and provides face-quality labels for subsequent error analysis. We hope that the large-scale datasets developed in this work can further advance automatic character labeling in videos. All resources are available freely at https://sail.usc.edu/~ccmi/multiface.},
  keywords  = {computational media understanding, face clustering, face diarization, multiview correlation, self-supervision, triplet loss, video character labeling},
  pubstate  = {published},
  tppubtype = {article}
}