% Conference paper record from the RANLP 2021 proceedings (see the url field for the source listing).
% Proper nouns and acronyms are brace-protected as whole words so that styles
% which sentence-case titles keep their capitals without breaking kerning mid-word.
@inproceedings{kusampudi-etal-2021-corpus,
  title     = {Corpus Creation and Language Identification in Low-Resource Code-Mixed {Telugu-English} Text},
  author    = {Kusampudi, Siva Subrahamanyam Varma and
               Chaluvadi, Anudeep and
               Mamidi, Radhika},
  editor    = {Mitkov, Ruslan and
               Angelova, Galia},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing ({RANLP} 2021)},
  month     = sep,
  year      = {2021},
  address   = {Held Online},
  publisher = {INCOMA Ltd.},
  url       = {https://aclanthology.org/2021.ranlp-1.85},
  pages     = {744--752},
  abstract  = {Code-Mixing (CM) is a common phenomenon in multilingual societies. CM plays a significant role in technology and medical fields where terminologies in the native language are not available or known. Language Identification (LID) of the CM data will help solve NLP tasks such as Spell Checking, Named Entity Recognition, Part-Of-Speech tagging, and Semantic Parsing. In the current era of machine learning, a common problem to the above-mentioned tasks is the availability of Learning data to train models. In this paper, we introduce two Telugu-English CM manually annotated datasets (Twitter dataset and Blog dataset). The Twitter dataset contains more romanization variability and misspelled words than the blog dataset. We compare across various classification models and perform extensive bench-marking using both Classical and Deep Learning Models for LID compared to existing models. We propose two architectures for language classification (Telugu and English) in CM data: (1) Word Level Classification (2) Sentence Level word-by-word Classification and compare these approaches presenting two strong baselines for LID on these datasets.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record: the paper itself, with its
     host proceedings described in the nested relatedItem element. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kusampudi-etal-2021-corpus">
    <titleInfo>
      <title>Corpus Creation and Language Identification in Low-Resource Code-Mixed Telugu-English Text</title>
    </titleInfo>
    <!-- Paper authors, in byline order -->
    <name type="personal">
      <namePart type="given">Siva</namePart>
      <namePart type="given">Subrahamanyam</namePart>
      <namePart type="given">Varma</namePart>
      <namePart type="family">Kusampudi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anudeep</namePart>
      <namePart type="family">Chaluvadi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Radhika</namePart>
      <namePart type="family">Mamidi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors and publisher -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ruslan</namePart>
        <namePart type="family">Mitkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Galia</namePart>
        <namePart type="family">Angelova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>INCOMA Ltd.</publisher>
        <place>
          <placeTerm type="text">Held Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Code-Mixing (CM) is a common phenomenon in multilingual societies. CM plays a significant role in technology and medical fields where terminologies in the native language are not available or known. Language Identification (LID) of the CM data will help solve NLP tasks such as Spell Checking, Named Entity Recognition, Part-Of-Speech tagging, and Semantic Parsing. In the current era of machine learning, a common problem to the above-mentioned tasks is the availability of Learning data to train models. In this paper, we introduce two Telugu-English CM manually annotated datasets (Twitter dataset and Blog dataset). The Twitter dataset contains more romanization variability and misspelled words than the blog dataset. We compare across various classification models and perform extensive bench-marking using both Classical and Deep Learning Models for LID compared to existing models. We propose two architectures for language classification (Telugu and English) in CM data: (1) Word Level Classification (2) Sentence Level word-by-word Classification and compare these approaches presenting two strong baselines for LID on these datasets.</abstract>
    <identifier type="citekey">kusampudi-etal-2021-corpus</identifier>
    <location>
      <url>https://aclanthology.org/2021.ranlp-1.85</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2021-09</date>
      <extent unit="page">
        <start>744</start>
        <end>752</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Corpus Creation and Language Identification in Low-Resource Code-Mixed Telugu-English Text
%A Kusampudi, Siva Subrahamanyam Varma
%A Chaluvadi, Anudeep
%A Mamidi, Radhika
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)
%D 2021
%8 September
%I INCOMA Ltd.
%C Held Online
%F kusampudi-etal-2021-corpus
%X Code-Mixing (CM) is a common phenomenon in multilingual societies. CM plays a significant role in technology and medical fields where terminologies in the native language are not available or known. Language Identification (LID) of the CM data will help solve NLP tasks such as Spell Checking, Named Entity Recognition, Part-Of-Speech tagging, and Semantic Parsing. In the current era of machine learning, a common problem to the above-mentioned tasks is the availability of Learning data to train models. In this paper, we introduce two Telugu-English CM manually annotated datasets (Twitter dataset and Blog dataset). The Twitter dataset contains more romanization variability and misspelled words than the blog dataset. We compare across various classification models and perform extensive bench-marking using both Classical and Deep Learning Models for LID compared to existing models. We propose two architectures for language classification (Telugu and English) in CM data: (1) Word Level Classification (2) Sentence Level word-by-word Classification and compare these approaches presenting two strong baselines for LID on these datasets.
%U https://aclanthology.org/2021.ranlp-1.85
%P 744-752
Markdown (Informal)
[Corpus Creation and Language Identification in Low-Resource Code-Mixed Telugu-English Text](https://aclanthology.org/2021.ranlp-1.85) (Kusampudi et al., RANLP 2021)
ACL