Greer, Timothy; Shi, Xuan; Ma, Benjamin; Narayanan, Shrikanth
Creating musical features using multi-faceted, multi-task encoders based on transformers Journal Article
In: Scientific Reports, 13 (1), pp. 10713, 2023.
Abstract | BibTeX | Tags: autoencoders, music representations, self-supervision
@article{greer2023creating,
title = {Creating musical features using multi-faceted, multi-task encoders based on transformers},
author = {Timothy Greer and Xuan Shi and Benjamin Ma and Shrikanth Narayanan},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
journal = {Scientific Reports},
volume = {13},
number = {1},
pages = {10713},
publisher = {Nature Publishing Group UK London},
abstract = {Computational machine intelligence approaches have enabled a variety of music-centric technologies in support of creating, sharing and interacting with music content. A strong performance on specific downstream application tasks, such as music genre detection and music emotion recognition, is paramount to ensuring broad capabilities for computational music understanding and Music Information Retrieval. Traditional approaches have relied on supervised learning to train models to support these music-related tasks. However, such approaches require copious annotated data and still may only provide insight into one view of music—namely, that related to the specific task at hand. We present a new model for generating audio-musical features that support music understanding, leveraging self-supervision and cross-domain learning. After pre-training using masked reconstruction of musical input features using self-attention bidirectional transformers, output representations are fine-tuned using several downstream music understanding tasks. Results show that the features generated by our multi-faceted, multi-task, music transformer model, which we call M3BERT, tend to outperform other audio and music embeddings on several diverse music-related tasks, indicating the potential of self-supervised and semi-supervised learning approaches toward a more generalized and robust computational approach to modeling music. Our work can offer a starting point for many music-related modeling tasks, with potential applications in learning deep representations and enabling robust technology applications.},
keywords = {autoencoders, music representations, self-supervision},
pubstate = {published},
tppubtype = {article}
}
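
To make the pre-training recipe in the abstract concrete, below is a minimal PyTorch sketch of masked-frame reconstruction with a bidirectional self-attention encoder. The feature dimensions, masking ratio, and the names MaskedMusicEncoder and masked_reconstruction_loss are illustrative assumptions, not the authors' released M3BERT code; positional encodings and the downstream fine-tuning stage are omitted for brevity.

    # Hypothetical sketch of masked-reconstruction pre-training (not the M3BERT release).
    import torch
    import torch.nn as nn

    class MaskedMusicEncoder(nn.Module):
        """Bidirectional transformer encoder trained to reconstruct masked input frames."""

        def __init__(self, feat_dim=80, d_model=256, n_heads=4, n_layers=4):
            super().__init__()
            self.in_proj = nn.Linear(feat_dim, d_model)   # project features to model width
            layer = nn.TransformerEncoderLayer(d_model, n_heads,
                                               dim_feedforward=4 * d_model,
                                               batch_first=True)
            # Self-attention attends in both directions; positional encodings omitted here.
            self.encoder = nn.TransformerEncoder(layer, n_layers)
            self.out_proj = nn.Linear(d_model, feat_dim)  # map back to the input feature space

        def forward(self, x):
            return self.out_proj(self.encoder(self.in_proj(x)))

    def masked_reconstruction_loss(model, feats, mask_ratio=0.15):
        """Zero out a random subset of time frames; score reconstruction on those frames only."""
        mask = torch.rand(feats.shape[:2], device=feats.device) < mask_ratio  # (batch, time)
        corrupted = feats.masked_fill(mask.unsqueeze(-1), 0.0)
        recon = model(corrupted)
        # L1 error restricted to masked positions, as in masked-prediction objectives
        return (recon - feats).abs()[mask].mean()

    model = MaskedMusicEncoder()
    feats = torch.randn(8, 400, 80)   # 8 clips, 400 frames, 80-dim acoustic features (assumed)
    loss = masked_reconstruction_loss(model, feats)
    loss.backward()

After pre-training with an objective of this shape, the encoder outputs would serve as the musical features that are fine-tuned on the downstream tasks the paper evaluates.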
Somandepalli, Krishna; Martinez, Victor; Kumar, Naveen; Narayanan, Shrikanth
Multimodal Representation of Advertisements Using Segment-Level Autoencoders Inproceedings
In: Proceedings of the 20th ACM International Conference on Multimodal Interaction, pp. 418–422, Association for Computing Machinery, Boulder, CO, USA, 2018, ISBN: 9781450356923.
Abstract | Links | BibTeX | Tags: advertisements, autoencoders, multimodal joint representation
@inproceedings{10.1145/3242969.3243026,
title = {Multimodal Representation of Advertisements Using Segment-Level Autoencoders},
author = {Krishna Somandepalli and Victor Martinez and Naveen Kumar and Shrikanth Narayanan},
url = {https://doi.org/10.1145/3242969.3243026},
doi = {10.1145/3242969.3243026},
isbn = {9781450356923},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 20th ACM International Conference on Multimodal Interaction},
pages = {418--422},
publisher = {Association for Computing Machinery},
address = {Boulder, CO, USA},
series = {ICMI '18},
abstract = {Automatic analysis of advertisements (ads) poses an interesting problem for learning multimodal representations. A promising direction of research is the development of deep neural network autoencoders to obtain inter-modal and intra-modal representations. In this work, we propose a system to obtain segment-level unimodal and joint representations. These features are concatenated, and then averaged across the duration of an ad to obtain a single multimodal representation. The autoencoders are trained using segments generated by time-aligning frames between the audio and video modalities with forward and backward context. In order to assess the multimodal representations, we consider the tasks of classifying an ad as funny or exciting in a publicly available dataset of 2,720 ads. For this purpose we train the segment-level autoencoders on a larger, unlabeled dataset of 9,740 ads, agnostic of the test set. Our experiments show that: 1) the multimodal representations outperform joint and unimodal representations, 2) the different representations we learn are complementary to each other, and 3) the segment-level multimodal representations perform better than classical autoencoders and cross-modal representations -- within the context of the two classification tasks. We obtain an improvement of about 5% in classification accuracy compared to a competitive baseline.},
keywords = {advertisements, autoencoders, multimodal joint representation},
pubstate = {published},
tppubtype = {inproceedings}
}
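
As a rough illustration of the segment-level pipeline described above (unimodal and joint autoencoders per time-aligned segment, bottleneck concatenation, then averaging across the ad), here is a short PyTorch sketch. The feature sizes, bottleneck width, layer shapes, and the names SegmentAutoencoder and ad_representation are hypothetical assumptions for illustration, not the paper's architecture or released code.

    # Hypothetical sketch of segment-level unimodal + joint autoencoder features.
    import torch
    import torch.nn as nn

    class SegmentAutoencoder(nn.Module):
        """Fully connected autoencoder over one time-aligned segment's features."""

        def __init__(self, in_dim, bottleneck=128):
            super().__init__()
            self.encoder = nn.Sequential(nn.Linear(in_dim, 512), nn.ReLU(),
                                         nn.Linear(512, bottleneck))
            self.decoder = nn.Sequential(nn.Linear(bottleneck, 512), nn.ReLU(),
                                         nn.Linear(512, in_dim))

        def forward(self, x):
            z = self.encoder(x)        # segment-level representation
            return self.decoder(z), z  # reconstruction (for training) and bottleneck code

    def ad_representation(audio_segs, video_segs, ae_audio, ae_video, ae_joint):
        """Concatenate unimodal and joint codes per segment, then average over the ad."""
        _, z_a = ae_audio(audio_segs)                               # (n_segments, 128)
        _, z_v = ae_video(video_segs)                               # (n_segments, 128)
        _, z_j = ae_joint(torch.cat([audio_segs, video_segs], -1))  # (n_segments, 128)
        return torch.cat([z_a, z_v, z_j], -1).mean(0)               # one vector per ad

    # Assumed dimensions: 100-dim audio and 512-dim video features per segment.
    ae_a, ae_v = SegmentAutoencoder(100), SegmentAutoencoder(512)
    ae_j = SegmentAutoencoder(100 + 512)
    audio, video = torch.randn(40, 100), torch.randn(40, 512)   # 40 segments in one ad
    ad_vec = ad_representation(audio, video, ae_a, ae_v, ae_j)  # shape: (384,)

In this reading, each autoencoder would be trained with a reconstruction loss on the unlabeled 9,740-ad set, and the averaged per-ad vector would feed the funny/exciting classifiers evaluated in the paper.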