@inproceedings{vella-etal-2024-towards-corpus,
title = "Towards a Corpus of Spoken {M}altese: Korpus tal-Malti Mitkellem, {KMM}",
author = "Vella, Alexandra (Sandra) and
Agius, Sarah and
Williams, Aiden and
Borg, Claudia",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1420",
pages = "16343--16352",
abstract = "This paper presents the rationale for a {``}dedicated{''} corpus of spoken Maltese, Korpus tal-Malti Mitkellem, KMM, {`}Corpus of Spoken Maltese{'}, based on the concept of a gold-standard Core collection. The Core collection is designed to cater to as wide a variety of user needs as possible whilst respecting basic principles governing corpus design, such as representativeness and balance, and delivering high quality in terms of both audio quality and annotations. An overview is provided of the composition of the current Core corpus of around 20 hours of data and of the human annotation effort involved. We also carry out a small qualitative analysis of the output of a Maltese ASR system and compare it to the human annotators{'} output. Initial results are promising, showing that the ASR is robust enough to generate first-pass texts for annotators to work on, thus reducing the human effort, and consequently, the cost involved.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vella-etal-2024-towards-corpus">
<titleInfo>
<title>Towards a Corpus of Spoken Maltese: Korpus tal-Malti Mitkellem, KMM</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="given">(Sandra)</namePart>
<namePart type="family">Vella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="family">Agius</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aiden</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Borg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents the rationale for a “dedicated” corpus of spoken Maltese, Korpus tal-Malti Mitkellem, KMM, ‘Corpus of Spoken Maltese’, based on the concept of a gold-standard Core collection. The Core collection is designed to cater to as wide a variety of user needs as possible whilst respecting basic principles governing corpus design, such as representativeness and balance, and delivering high quality in terms of both audio quality and annotations. An overview is provided of the composition of the current Core corpus of around 20 hours of data and of the human annotation effort involved. We also carry out a small qualitative analysis of the output of a Maltese ASR system and compare it to the human annotators’ output. Initial results are promising, showing that the ASR is robust enough to generate first-pass texts for annotators to work on, thus reducing the human effort, and consequently, the cost involved.</abstract>
<identifier type="citekey">vella-etal-2024-towards-corpus</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1420</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>16343</start>
<end>16352</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards a Corpus of Spoken Maltese: Korpus tal-Malti Mitkellem, KMM
%A Vella, Alexandra (Sandra)
%A Agius, Sarah
%A Williams, Aiden
%A Borg, Claudia
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F vella-etal-2024-towards-corpus
%X This paper presents the rationale for a “dedicated” corpus of spoken Maltese, Korpus tal-Malti Mitkellem, KMM, ‘Corpus of Spoken Maltese’, based on the concept of a gold-standard Core collection. The Core collection is designed to cater to as wide a variety of user needs as possible whilst respecting basic principles governing corpus design, such as representativeness and balance, and delivering high quality in terms of both audio quality and annotations. An overview is provided of the composition of the current Core corpus of around 20 hours of data and of the human annotation effort involved. We also carry out a small qualitative analysis of the output of a Maltese ASR system and compare it to the human annotators’ output. Initial results are promising, showing that the ASR is robust enough to generate first-pass texts for annotators to work on, thus reducing the human effort, and consequently, the cost involved.
%U https://aclanthology.org/2024.lrec-main.1420
%P 16343-16352
Markdown (Informal)
[Towards a Corpus of Spoken Maltese: Korpus tal-Malti Mitkellem, KMM](https://aclanthology.org/2024.lrec-main.1420) (Vella et al., LREC-COLING 2024)
ACL
- Alexandra (Sandra) Vella, Sarah Agius, Aiden Williams, and Claudia Borg. 2024. Towards a Corpus of Spoken Maltese: Korpus tal-Malti Mitkellem, KMM. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 16343–16352, Torino, Italia. ELRA and ICCL.