@inproceedings{culy-riehemann-2003-limits,
title = "The limits of n-gram translation evaluation metrics",
author = "Culy, Christopher and
Riehemann, Susanne Z.",
booktitle = "Proceedings of Machine Translation Summit IX: Papers",
month = sep # " 23-27",
year = "2003",
address = "New Orleans, USA",
url = "https://aclanthology.org/2003.mtsummit-papers.10",
abstract = "N-gram measures of translation quality, such as BLEU and the related NIST metric, are becoming increasingly important in machine translation, yet their behaviors are not fully understood. In this paper we examine the performance of these metrics on professional human translations into German of two literary genres, the Bible and Tom Sawyer. The most surprising result is that some machine translations outscore some professional human translations. In addition, it can be difficult to distinguish some other human translations from machine translations with only two reference translations; with four reference translations it is much easier. Our results lead us to conclude that much care must be taken in using n-gram measures in formal evaluations of machine translation quality, though they are still valuable as part of the iterative development cycle.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="culy-riehemann-2003-limits">
<titleInfo>
<title>The limits of n-gram translation evaluation metrics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Culy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Susanne</namePart>
<namePart type="given">Z</namePart>
<namePart type="family">Riehemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>September 23-27, 2003</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit IX: Papers</title>
</titleInfo>
<originInfo>
<place>
<placeTerm type="text">New Orleans, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>N-gram measures of translation quality, such as BLEU and the related NIST metric, are becoming increasingly important in machine translation, yet their behaviors are not fully understood. In this paper we examine the performance of these metrics on professional human translations into German of two literary genres, the Bible and Tom Sawyer. The most surprising result is that some machine translations outscore some professional human translations. In addition, it can be difficult to distinguish some other human translations from machine translations with only two reference translations; with four reference translations it is much easier. Our results lead us to conclude that much care must be taken in using n-gram measures in formal evaluations of machine translation quality, though they are still valuable as part of the iterative development cycle.</abstract>
<identifier type="citekey">culy-riehemann-2003-limits</identifier>
<location>
<url>https://aclanthology.org/2003.mtsummit-papers.10</url>
</location>
<part>
<date>September 23-27, 2003</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The limits of n-gram translation evaluation metrics
%A Culy, Christopher
%A Riehemann, Susanne Z.
%S Proceedings of Machine Translation Summit IX: Papers
%D 2003
%8 September 23-27
%C New Orleans, USA
%F culy-riehemann-2003-limits
%X N-gram measures of translation quality, such as BLEU and the related NIST metric, are becoming increasingly important in machine translation, yet their behaviors are not fully understood. In this paper we examine the performance of these metrics on professional human translations into German of two literary genres, the Bible and Tom Sawyer. The most surprising result is that some machine translations outscore some professional human translations. In addition, it can be difficult to distinguish some other human translations from machine translations with only two reference translations; with four reference translations it is much easier. Our results lead us to conclude that much care must be taken in using n-gram measures in formal evaluations of machine translation quality, though they are still valuable as part of the iterative development cycle.
%U https://aclanthology.org/2003.mtsummit-papers.10
Markdown (Informal)
[The limits of n-gram translation evaluation metrics](https://aclanthology.org/2003.mtsummit-papers.10) (Culy & Riehemann, MTSummit 2003)
ACL