@inproceedings{cavalin-etal-2023-understanding,
title = "Understanding Native Language Identification for {B}razilian Indigenous Languages",
author = "Cavalin, Paulo and
Domingues, Pedro and
Nogima, Julio and
Pinhanez, Claudio",
editor = "Mager, Manuel and
Ebrahimi, Abteen and
Oncevay, Arturo and
Rice, Enora and
Rijhwani, Shruti and
Palmer, Alexis and
Kann, Katharina",
booktitle = "Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.americasnlp-1.3",
doi = "10.18653/v1/2023.americasnlp-1.3",
pages = "12--18",
abstract = "We investigate native language identification (LangID) for Brazilian Indigenous Languages (BILs), using the Bible as training data. Our research extends from previous work, by presenting two analyses on the generalization of Bible-based LangID in non-biblical data. First, with newly collected non-biblical datasets, we show that such a LangID can still provide quite reasonable accuracy in languages for which there are more established writing standards, such as Guarani Mbya and Kaigang, but there can be a quite drastic drop in accuracy depending on the language. Then, we applied the LangID on a large set of texts, about 13M sentences from the Portuguese Wikipedia, towards understanding the difficulty factors may come out of such task in practice. The main outcome is that the lack of handling other American indigenous languages can affect considerably the precision for BILs, suggesting the need of a joint effort with related languages from the Americas.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cavalin-etal-2023-understanding">
<titleInfo>
<title>Understanding Native Language Identification for Brazilian Indigenous Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paulo</namePart>
<namePart type="family">Cavalin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="family">Domingues</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julio</namePart>
<namePart type="family">Nogima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudio</namePart>
<namePart type="family">Pinhanez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manuel</namePart>
<namePart type="family">Mager</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enora</namePart>
<namePart type="family">Rice</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Palmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katharina</namePart>
<namePart type="family">Kann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We investigate native language identification (LangID) for Brazilian Indigenous Languages (BILs), using the Bible as training data. Our research extends from previous work, by presenting two analyses on the generalization of Bible-based LangID in non-biblical data. First, with newly collected non-biblical datasets, we show that such a LangID can still provide quite reasonable accuracy in languages for which there are more established writing standards, such as Guarani Mbya and Kaigang, but there can be a quite drastic drop in accuracy depending on the language. Then, we applied the LangID on a large set of texts, about 13M sentences from the Portuguese Wikipedia, towards understanding the difficulty factors may come out of such task in practice. The main outcome is that the lack of handling other American indigenous languages can affect considerably the precision for BILs, suggesting the need of a joint effort with related languages from the Americas.</abstract>
<identifier type="citekey">cavalin-etal-2023-understanding</identifier>
<identifier type="doi">10.18653/v1/2023.americasnlp-1.3</identifier>
<location>
<url>https://aclanthology.org/2023.americasnlp-1.3</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>12</start>
<end>18</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Understanding Native Language Identification for Brazilian Indigenous Languages
%A Cavalin, Paulo
%A Domingues, Pedro
%A Nogima, Julio
%A Pinhanez, Claudio
%Y Mager, Manuel
%Y Ebrahimi, Abteen
%Y Oncevay, Arturo
%Y Rice, Enora
%Y Rijhwani, Shruti
%Y Palmer, Alexis
%Y Kann, Katharina
%S Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F cavalin-etal-2023-understanding
%X We investigate native language identification (LangID) for Brazilian Indigenous Languages (BILs), using the Bible as training data. Our research extends from previous work, by presenting two analyses on the generalization of Bible-based LangID in non-biblical data. First, with newly collected non-biblical datasets, we show that such a LangID can still provide quite reasonable accuracy in languages for which there are more established writing standards, such as Guarani Mbya and Kaigang, but there can be a quite drastic drop in accuracy depending on the language. Then, we applied the LangID on a large set of texts, about 13M sentences from the Portuguese Wikipedia, towards understanding the difficulty factors may come out of such task in practice. The main outcome is that the lack of handling other American indigenous languages can affect considerably the precision for BILs, suggesting the need of a joint effort with related languages from the Americas.
%R 10.18653/v1/2023.americasnlp-1.3
%U https://aclanthology.org/2023.americasnlp-1.3
%U https://doi.org/10.18653/v1/2023.americasnlp-1.3
%P 12-18
Markdown (Informal)
[Understanding Native Language Identification for Brazilian Indigenous Languages](https://aclanthology.org/2023.americasnlp-1.3) (Cavalin et al., AmericasNLP 2023)
ACL