@inproceedings{ye-etal-2024-mmad-multi,
title = "{MMAD}:Multi-modal Movie Audio Description",
author = "Ye, Xiaojun and
Chen, Junhao and
Li, Xiang and
Xin, Haidong and
Li, Chao and
Zhou, Sheng and
Bu, Jiajun",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.998",
pages = "11415--11428",
abstract = "Audio Description (AD) aims to generate narrations of information that is not accessible through unimodal hearing in movies to aid the visually impaired in following film narratives. Current solutions rely heavily on manual work, resulting in high costs and limited scalability. While automatic methods have been introduced, they often yield descriptions that are sparse and omit key details. ddressing these challenges, we propose a novel automated pipeline, the Multi-modal Movie Audio Description (MMAD). MMAD harnesses the capabilities of three key modules as well as the power of Llama2 to augment the depth and breadth of the generated descriptions. Specifically, first, we propose an Audio-aware Feature Enhancing Module to provide the model with multi-modal perception capabilities, enriching the background descriptions with a more comprehensive understanding of the environmental features. Second, we propose an Actor-tracking-aware Story Linking Module to aid in the generation of contextual and character-centric descriptions, thereby enhancing the richness of character depictions. Third, we incorporate a Subtitled Movie Clip Contextual Alignment Module, supplying semantic information about various time periods throughout the movie, which facilitates the consideration of the full movie narrative context when describing silent segments, thereby enhancing the richness of the descriptions. Experiments on widely used datasets convincingly demonstrates that MMAD significantly surpasses existing strong baselines in performance, establishing a new state-of-the-art in the field. Our code will be released at https://github.com/Daria8976/MMAD.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ye-etal-2024-mmad-multi">
<titleInfo>
<title>MMAD:Multi-modal Movie Audio Description</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiaojun</namePart>
<namePart type="family">Ye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junhao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haidong</namePart>
<namePart type="family">Xin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sheng</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Bu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Audio Description (AD) aims to generate narrations of information that is not accessible through unimodal hearing in movies to aid the visually impaired in following film narratives. Current solutions rely heavily on manual work, resulting in high costs and limited scalability. While automatic methods have been introduced, they often yield descriptions that are sparse and omit key details. ddressing these challenges, we propose a novel automated pipeline, the Multi-modal Movie Audio Description (MMAD). MMAD harnesses the capabilities of three key modules as well as the power of Llama2 to augment the depth and breadth of the generated descriptions. Specifically, first, we propose an Audio-aware Feature Enhancing Module to provide the model with multi-modal perception capabilities, enriching the background descriptions with a more comprehensive understanding of the environmental features. Second, we propose an Actor-tracking-aware Story Linking Module to aid in the generation of contextual and character-centric descriptions, thereby enhancing the richness of character depictions. Third, we incorporate a Subtitled Movie Clip Contextual Alignment Module, supplying semantic information about various time periods throughout the movie, which facilitates the consideration of the full movie narrative context when describing silent segments, thereby enhancing the richness of the descriptions. Experiments on widely used datasets convincingly demonstrates that MMAD significantly surpasses existing strong baselines in performance, establishing a new state-of-the-art in the field. Our code will be released at https://github.com/Daria8976/MMAD.</abstract>
<identifier type="citekey">ye-etal-2024-mmad-multi</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.998</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>11415</start>
<end>11428</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MMAD:Multi-modal Movie Audio Description
%A Ye, Xiaojun
%A Chen, Junhao
%A Li, Xiang
%A Xin, Haidong
%A Li, Chao
%A Zhou, Sheng
%A Bu, Jiajun
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F ye-etal-2024-mmad-multi
%X Audio Description (AD) aims to generate narrations of information that is not accessible through unimodal hearing in movies to aid the visually impaired in following film narratives. Current solutions rely heavily on manual work, resulting in high costs and limited scalability. While automatic methods have been introduced, they often yield descriptions that are sparse and omit key details. ddressing these challenges, we propose a novel automated pipeline, the Multi-modal Movie Audio Description (MMAD). MMAD harnesses the capabilities of three key modules as well as the power of Llama2 to augment the depth and breadth of the generated descriptions. Specifically, first, we propose an Audio-aware Feature Enhancing Module to provide the model with multi-modal perception capabilities, enriching the background descriptions with a more comprehensive understanding of the environmental features. Second, we propose an Actor-tracking-aware Story Linking Module to aid in the generation of contextual and character-centric descriptions, thereby enhancing the richness of character depictions. Third, we incorporate a Subtitled Movie Clip Contextual Alignment Module, supplying semantic information about various time periods throughout the movie, which facilitates the consideration of the full movie narrative context when describing silent segments, thereby enhancing the richness of the descriptions. Experiments on widely used datasets convincingly demonstrates that MMAD significantly surpasses existing strong baselines in performance, establishing a new state-of-the-art in the field. Our code will be released at https://github.com/Daria8976/MMAD.
%U https://aclanthology.org/2024.lrec-main.998
%P 11415-11428
Markdown (Informal)
[MMAD:Multi-modal Movie Audio Description](https://aclanthology.org/2024.lrec-main.998) (Ye et al., LREC-COLING 2024)
ACL
- Xiaojun Ye, Junhao Chen, Xiang Li, Haidong Xin, Chao Li, Sheng Zhou, and Jiajun Bu. 2024. MMAD:Multi-modal Movie Audio Description. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 11415–11428, Torino, Italia. ELRA and ICCL.