@inproceedings{byamugisha-2020-generating,
title = "Generating Varied Training Corpora in {R}unyankore Using a Combined Semantic and Syntactic, Pattern-Grammar-based Approach",
author = "Byamugisha, Joan",
editor = "Davis, Brian and
Graham, Yvette and
Kelleher, John and
Sripada, Yaji",
booktitle = "Proceedings of the 13th International Conference on Natural Language Generation",
month = dec,
year = "2020",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.inlg-1.34/",
doi = "10.18653/v1/2020.inlg-1.34",
pages = "273--282",
abstract = "Machine learning algorithms have been applied to achieve high levels of accuracy in tasks associated with the processing of natural language. However, these algorithms require large amounts of training data in order to perform efficiently. Since most Bantu languages lack the required training corpora because they are computationally under-resourced, we investigated how to generate a large varied training corpus in Runyankore, a Bantu language indigenous to Uganda. We found the use of a combined semantic and syntactic, pattern and grammar-based approach to be applicable to this purpose, and used it to generate one million sentences, both labelled and unlabelled, which can be applied as training data for machine learning algorithms. The generated text was evaluated in two ways: (1) assessing the semantics encoded in word embeddings obtained from the generated text, which showed correct word similarity; and (2) applying the labelled data to tasks such as sentiment analysis, which achieved satisfactory levels of accuracy."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="byamugisha-2020-generating">
<titleInfo>
<title>Generating Varied Training Corpora in Runyankore Using a Combined Semantic and Syntactic, Pattern-Grammar-based Approach</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joan</namePart>
<namePart type="family">Byamugisha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 13th International Conference on Natural Language Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Davis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Kelleher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaji</namePart>
<namePart type="family">Sripada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Machine learning algorithms have been applied to achieve high levels of accuracy in tasks associated with the processing of natural language. However, these algorithms require large amounts of training data in order to perform efficiently. Since most Bantu languages lack the required training corpora because they are computationally under-resourced, we investigated how to generate a large varied training corpus in Runyankore, a Bantu language indigenous to Uganda. We found the use of a combined semantic and syntactic, pattern and grammar-based approach to be applicable to this purpose, and used it to generate one million sentences, both labelled and unlabelled, which can be applied as training data for machine learning algorithms. The generated text was evaluated in two ways: (1) assessing the semantics encoded in word embeddings obtained from the generated text, which showed correct word similarity; and (2) applying the labelled data to tasks such as sentiment analysis, which achieved satisfactory levels of accuracy.</abstract>
<identifier type="citekey">byamugisha-2020-generating</identifier>
<identifier type="doi">10.18653/v1/2020.inlg-1.34</identifier>
<location>
<url>https://aclanthology.org/2020.inlg-1.34/</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>273</start>
<end>282</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Generating Varied Training Corpora in Runyankore Using a Combined Semantic and Syntactic, Pattern-Grammar-based Approach
%A Byamugisha, Joan
%Y Davis, Brian
%Y Graham, Yvette
%Y Kelleher, John
%Y Sripada, Yaji
%S Proceedings of the 13th International Conference on Natural Language Generation
%D 2020
%8 December
%I Association for Computational Linguistics
%C Dublin, Ireland
%F byamugisha-2020-generating
%X Machine learning algorithms have been applied to achieve high levels of accuracy in tasks associated with the processing of natural language. However, these algorithms require large amounts of training data in order to perform efficiently. Since most Bantu languages lack the required training corpora because they are computationally under-resourced, we investigated how to generate a large varied training corpus in Runyankore, a Bantu language indigenous to Uganda. We found the use of a combined semantic and syntactic, pattern and grammar-based approach to be applicable to this purpose, and used it to generate one million sentences, both labelled and unlabelled, which can be applied as training data for machine learning algorithms. The generated text was evaluated in two ways: (1) assessing the semantics encoded in word embeddings obtained from the generated text, which showed correct word similarity; and (2) applying the labelled data to tasks such as sentiment analysis, which achieved satisfactory levels of accuracy.
%R 10.18653/v1/2020.inlg-1.34
%U https://aclanthology.org/2020.inlg-1.34/
%U https://doi.org/10.18653/v1/2020.inlg-1.34
%P 273-282
Markdown (Informal)
[Generating Varied Training Corpora in Runyankore Using a Combined Semantic and Syntactic, Pattern-Grammar-based Approach](https://aclanthology.org/2020.inlg-1.34/) (Byamugisha, INLG 2020)
ACL