@inproceedings{malema-etal-2020-complex,
title = "Complex Setswana Parts of Speech Tagging",
author = "Malema, Gabofetswe and
Okgetheng, Boago and
Tebalo, Bopaki and
Motlhanka, Moffat and
Rammidi, Goaletsa",
editor = "Mabuya, Rooweither and
Ramukhadi, Phathutshedzo and
Setaka, Mmasibidi and
Wagner, Valencia and
van Zaanen, Menno",
booktitle = "Proceedings of the first workshop on Resources for African Indigenous Languages",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/2020.rail-1.4",
pages = "21--24",
abstract = "Setswana language is one of the Bantu languages written disjunctively. Some of its parts of speech such as qualificatives and some adverbs are made up of multiple words. That is, the part of speech is made up of a group of words. The disjunctive style of writing poses a challenge when a sentence is tokenized or when tagging. A few studies have been done on identification of multi-word parts of speech. In this study we go further to tokenize complex parts of speech which are formed by extending basic forms of multi-word parts of speech. The parts of speech are extended by recursively concatenating more parts of speech to a basic form of parts of speech. We developed rules for building complex relative parts of speech. A morphological analyzer and Python NLTK are used to tag individual words and basic forms of multi-word parts of speech. Developed rules are then used to identify complex parts of speech. Results from a 300-sentence text file give a performance of 74{\%}. The tagger fails when it encounters expansion rules not implemented and when tagging by the morphological analyzer is incorrect.",
language = "English",
ISBN = "979-10-95546-60-3",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="malema-etal-2020-complex">
<titleInfo>
<title>Complex Setswana Parts of Speech Tagging</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gabofetswe</namePart>
<namePart type="family">Malema</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Boago</namePart>
<namePart type="family">Okgetheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bopaki</namePart>
<namePart type="family">Tebalo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moffat</namePart>
<namePart type="family">Motlhanka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Goaletsa</namePart>
<namePart type="family">Rammidi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the first workshop on Resources for African Indigenous Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rooweither</namePart>
<namePart type="family">Mabuya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Phathutshedzo</namePart>
<namePart type="family">Ramukhadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mmasibidi</namePart>
<namePart type="family">Setaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valencia</namePart>
<namePart type="family">Wagner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Menno</namePart>
<namePart type="family">van Zaanen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-60-3</identifier>
</relatedItem>
<abstract>Setswana language is one of the Bantu languages written disjunctively. Some of its parts of speech such as qualificatives and some adverbs are made up of multiple words. That is, the part of speech is made up of a group of words. The disjunctive style of writing poses a challenge when a sentence is tokenized or when tagging. A few studies have been done on identification of multi-word parts of speech. In this study we go further to tokenize complex parts of speech which are formed by extending basic forms of multi-word parts of speech. The parts of speech are extended by recursively concatenating more parts of speech to a basic form of parts of speech. We developed rules for building complex relative parts of speech. A morphological analyzer and Python NLTK are used to tag individual words and basic forms of multi-word parts of speech. Developed rules are then used to identify complex parts of speech. Results from a 300-sentence text file give a performance of 74%. The tagger fails when it encounters expansion rules not implemented and when tagging by the morphological analyzer is incorrect.</abstract>
<identifier type="citekey">malema-etal-2020-complex</identifier>
<location>
<url>https://aclanthology.org/2020.rail-1.4</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>21</start>
<end>24</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Complex Setswana Parts of Speech Tagging
%A Malema, Gabofetswe
%A Okgetheng, Boago
%A Tebalo, Bopaki
%A Motlhanka, Moffat
%A Rammidi, Goaletsa
%Y Mabuya, Rooweither
%Y Ramukhadi, Phathutshedzo
%Y Setaka, Mmasibidi
%Y Wagner, Valencia
%Y van Zaanen, Menno
%S Proceedings of the first workshop on Resources for African Indigenous Languages
%D 2020
%8 May
%I European Language Resources Association (ELRA)
%C Marseille, France
%@ 979-10-95546-60-3
%G English
%F malema-etal-2020-complex
%X Setswana language is one of the Bantu languages written disjunctively. Some of its parts of speech such as qualificatives and some adverbs are made up of multiple words. That is, the part of speech is made up of a group of words. The disjunctive style of writing poses a challenge when a sentence is tokenized or when tagging. A few studies have been done on identification of multi-word parts of speech. In this study we go further to tokenize complex parts of speech which are formed by extending basic forms of multi-word parts of speech. The parts of speech are extended by recursively concatenating more parts of speech to a basic form of parts of speech. We developed rules for building complex relative parts of speech. A morphological analyzer and Python NLTK are used to tag individual words and basic forms of multi-word parts of speech. Developed rules are then used to identify complex parts of speech. Results from a 300-sentence text file give a performance of 74%. The tagger fails when it encounters expansion rules not implemented and when tagging by the morphological analyzer is incorrect.
%U https://aclanthology.org/2020.rail-1.4
%P 21-24
Markdown (Informal)
[Complex Setswana Parts of Speech Tagging](https://aclanthology.org/2020.rail-1.4) (Malema et al., RAIL 2020)
ACL
- Gabofetswe Malema, Boago Okgetheng, Bopaki Tebalo, Moffat Motlhanka, and Goaletsa Rammidi. 2020. Complex Setswana Parts of Speech Tagging. In Proceedings of the first workshop on Resources for African Indigenous Languages, pages 21–24, Marseille, France. European Language Resources Association (ELRA).