@inproceedings{1049419a2c2040558eea92be5853f8a1,
  title     = {{MAAS}: Multi-modal Assignation for Active Speaker Detection},
  author    = {Alc{\'a}zar, {Juan Le{\'o}n} and Heilbron, {Fabian Caba} and Thabet, {Ali K.} and Ghanem, Bernard},
  booktitle = {Proceedings - 2021 {IEEE/CVF} International Conference on Computer Vision, {ICCV} 2021},
  series    = {Proceedings of the {IEEE} International Conference on Computer Vision},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {265--274},
  year      = {2021},
  doi       = {10.1109/ICCV48922.2021.00033},
  language  = {English (US)},
  address   = {United States},
  abstract  = {Active speaker detection requires a mindful integration of multi-modal cues. Current methods focus on modeling and fusing short-term audiovisual features for individual speakers, often at frame level. We present a novel approach to active speaker detection that directly addresses the multi-modal nature of the problem and provides a straightforward strategy, where independent visual features (speakers) in the scene are assigned to a previously detected speech event. Our experiments show that a small graph data structure built from local information can approximate an instantaneous audio-visual assignment problem. Moreover, the temporal extension of this initial graph achieves a new state-of-the-art performance on the AVA-ActiveSpeaker dataset with a mAP of 88.8\%.},
  note      = {Funding Information: Acknowledgments. This work was supported by the King Abdullah University of Science and Technology (KAUST) Office of Sponsored Research through the Visual Computing Center (VCC) funding. Publisher Copyright: {\textcopyright} 2021 IEEE; 18th IEEE/CVF International Conference on Computer Vision, ICCV 2021 ; Conference date: 11-10-2021 Through 17-10-2021},
}