@article{chen-eger-2023-menli,
title = "{MENLI}: Robust Evaluation Metrics from Natural Language Inference",
author = "Chen, Yanran and
Eger, Steffen",
journal = "Transactions of the Association for Computational Linguistics",
volume = "11",
year = "2023",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2023.tacl-1.47",
doi = "10.1162/tacl_a_00576",
pages = "804--825",
abstract = "Recently proposed BERT-based evaluation metrics for text generation perform well on standard benchmarks but are vulnerable to adversarial attacks, e.g., relating to information correctness. We argue that this stems (in part) from the fact that they are models of semantic similarity. In contrast, we develop evaluation metrics based on Natural Language Inference (NLI), which we deem a more appropriate modeling. We design a preference-based adversarial attack framework and show that our NLI based metrics are much more robust to the attacks than the recent BERT-based metrics. On standard benchmarks, our NLI based metrics outperform existing summarization metrics, but perform below SOTA MT metrics. However, when combining existing metrics with our NLI metrics, we obtain both higher adversarial robustness (15{\%}{--}30{\%}) and higher quality metrics as measured on standard benchmarks (+5{\%} to 30{\%}).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-eger-2023-menli">
<titleInfo>
<title>MENLI: Robust Evaluation Metrics from Natural Language Inference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yanran</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steffen</namePart>
<namePart type="family">Eger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Recently proposed BERT-based evaluation metrics for text generation perform well on standard benchmarks but are vulnerable to adversarial attacks, e.g., relating to information correctness. We argue that this stems (in part) from the fact that they are models of semantic similarity. In contrast, we develop evaluation metrics based on Natural Language Inference (NLI), which we deem a more appropriate modeling. We design a preference-based adversarial attack framework and show that our NLI based metrics are much more robust to the attacks than the recent BERT-based metrics. On standard benchmarks, our NLI based metrics outperform existing summarization metrics, but perform below SOTA MT metrics. However, when combining existing metrics with our NLI metrics, we obtain both higher adversarial robustness (15%–30%) and higher quality metrics as measured on standard benchmarks (+5% to 30%).</abstract>
<identifier type="citekey">chen-eger-2023-menli</identifier>
<identifier type="doi">10.1162/tacl_a_00576</identifier>
<location>
<url>https://aclanthology.org/2023.tacl-1.47</url>
</location>
<part>
<date>2023</date>
<detail type="volume"><number>11</number></detail>
<extent unit="page">
<start>804</start>
<end>825</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T MENLI: Robust Evaluation Metrics from Natural Language Inference
%A Chen, Yanran
%A Eger, Steffen
%J Transactions of the Association for Computational Linguistics
%D 2023
%V 11
%I MIT Press
%C Cambridge, MA
%F chen-eger-2023-menli
%X Recently proposed BERT-based evaluation metrics for text generation perform well on standard benchmarks but are vulnerable to adversarial attacks, e.g., relating to information correctness. We argue that this stems (in part) from the fact that they are models of semantic similarity. In contrast, we develop evaluation metrics based on Natural Language Inference (NLI), which we deem a more appropriate modeling. We design a preference-based adversarial attack framework and show that our NLI based metrics are much more robust to the attacks than the recent BERT-based metrics. On standard benchmarks, our NLI based metrics outperform existing summarization metrics, but perform below SOTA MT metrics. However, when combining existing metrics with our NLI metrics, we obtain both higher adversarial robustness (15%–30%) and higher quality metrics as measured on standard benchmarks (+5% to 30%).
%R 10.1162/tacl_a_00576
%U https://aclanthology.org/2023.tacl-1.47
%U https://doi.org/10.1162/tacl_a_00576
%P 804-825
Markdown (Informal)
[MENLI: Robust Evaluation Metrics from Natural Language Inference](https://aclanthology.org/2023.tacl-1.47) (Chen & Eger, TACL 2023)
ACL