@inproceedings{mehri-shwartz-2023-automatic,
title = "Automatic Evaluation of Generative Models with Instruction Tuning",
author = "Mehri, Shuhaib and
Shwartz, Vered",
editor = "Gehrmann, Sebastian and
Wang, Alex and
Sedoc, Jo{\~a}o and
Clark, Elizabeth and
Dhole, Kaustubh and
Chandu, Khyathi Raghavi and
Santus, Enrico and
Sedghamiz, Hooman",
booktitle = "Proceedings of the Third Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.gem-1.4",
pages = "42--52",
abstract = "Automatic evaluation of natural language generation has long been an elusive goal in NLP. A recent paradigm fine-tunes pre-trained language models to emulate human judgements for a particular task and evaluation criterion. Inspired by the generalization ability of instruction-tuned models, we propose a learned metric based on instruction tuning. To test our approach, we collected HEAP, a dataset of human judgements across various NLG tasks and evaluation criteria. Our findings demonstrate that instruction tuning language models on HEAP yields good performance on many evaluation tasks, though some criteria are less trivial to learn than others. Further, jointly training on multiple tasks can yield additional performance improvements, which can be beneficial for future tasks with little to no human annotated data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mehri-shwartz-2023-automatic">
<titleInfo>
<title>Automatic Evaluation of Generative Models with Instruction Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shuhaib</namePart>
<namePart type="family">Mehri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vered</namePart>
<namePart type="family">Shwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Clark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khyathi</namePart>
<namePart type="given">Raghavi</namePart>
<namePart type="family">Chandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hooman</namePart>
<namePart type="family">Sedghamiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic evaluation of natural language generation has long been an elusive goal in NLP. A recent paradigm fine-tunes pre-trained language models to emulate human judgements for a particular task and evaluation criterion. Inspired by the generalization ability of instruction-tuned models, we propose a learned metric based on instruction tuning. To test our approach, we collected HEAP, a dataset of human judgements across various NLG tasks and evaluation criteria. Our findings demonstrate that instruction tuning language models on HEAP yields good performance on many evaluation tasks, though some criteria are less trivial to learn than others. Further, jointly training on multiple tasks can yield additional performance improvements, which can be beneficial for future tasks with little to no human annotated data.</abstract>
<identifier type="citekey">mehri-shwartz-2023-automatic</identifier>
<location>
<url>https://aclanthology.org/2023.gem-1.4</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>42</start>
<end>52</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Evaluation of Generative Models with Instruction Tuning
%A Mehri, Shuhaib
%A Shwartz, Vered
%Y Gehrmann, Sebastian
%Y Wang, Alex
%Y Sedoc, João
%Y Clark, Elizabeth
%Y Dhole, Kaustubh
%Y Chandu, Khyathi Raghavi
%Y Santus, Enrico
%Y Sedghamiz, Hooman
%S Proceedings of the Third Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F mehri-shwartz-2023-automatic
%X Automatic evaluation of natural language generation has long been an elusive goal in NLP. A recent paradigm fine-tunes pre-trained language models to emulate human judgements for a particular task and evaluation criterion. Inspired by the generalization ability of instruction-tuned models, we propose a learned metric based on instruction tuning. To test our approach, we collected HEAP, a dataset of human judgements across various NLG tasks and evaluation criteria. Our findings demonstrate that instruction tuning language models on HEAP yields good performance on many evaluation tasks, though some criteria are less trivial to learn than others. Further, jointly training on multiple tasks can yield additional performance improvements, which can be beneficial for future tasks with little to no human annotated data.
%U https://aclanthology.org/2023.gem-1.4
%P 42-52
Markdown (Informal)
[Automatic Evaluation of Generative Models with Instruction Tuning](https://aclanthology.org/2023.gem-1.4) (Mehri & Shwartz, GEM-WS 2023)
ACL