@inproceedings{basaldella-etal-2020-cometa,
title = "{COMETA}: A Corpus for Medical Entity Linking in the Social Media",
author = "Basaldella, Marco and
Liu, Fangyu and
Shareghi, Ehsan and
Collier, Nigel",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.emnlp-main.253",
doi = "10.18653/v1/2020.emnlp-main.253",
pages = "3122--3137",
abstract = "Whilst there has been growing progress in Entity Linking (EL) for general language, existing datasets fail to address the complex nature of health terminology in layman{'}s language. Meanwhile, there is a growing need for applications that can understand the public{'}s voice in the health domain. To address this we introduce a new corpus called COMETA, consisting of 20k English biomedical entity mentions from Reddit expert-annotated with links to SNOMED CT, a widely-used medical knowledge graph. Our corpus satisfies a combination of desirable properties, from scale and coverage to diversity and quality, that to the best of our knowledge has not been met by any of the existing resources in the field. Through benchmark experiments on 20 EL baselines from string- to neural-based models we shed light on the ability of these systems to perform complex inference on entities and concepts under 2 challenging evaluation scenarios. Our experimental results on COMETA illustrate that no golden bullet exists and even the best mainstream techniques still have a significant performance gap to fill, while the best solution relies on combining different views of data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="basaldella-etal-2020-cometa">
<titleInfo>
<title>COMETA: A Corpus for Medical Entity Linking in the Social Media</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Basaldella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fangyu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehsan</namePart>
<namePart type="family">Shareghi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nigel</namePart>
<namePart type="family">Collier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bonnie</namePart>
<namePart type="family">Webber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Whilst there has been growing progress in Entity Linking (EL) for general language, existing datasets fail to address the complex nature of health terminology in layman’s language. Meanwhile, there is a growing need for applications that can understand the public’s voice in the health domain. To address this we introduce a new corpus called COMETA, consisting of 20k English biomedical entity mentions from Reddit expert-annotated with links to SNOMED CT, a widely-used medical knowledge graph. Our corpus satisfies a combination of desirable properties, from scale and coverage to diversity and quality, that to the best of our knowledge has not been met by any of the existing resources in the field. Through benchmark experiments on 20 EL baselines from string- to neural-based models we shed light on the ability of these systems to perform complex inference on entities and concepts under 2 challenging evaluation scenarios. Our experimental results on COMETA illustrate that no golden bullet exists and even the best mainstream techniques still have a significant performance gap to fill, while the best solution relies on combining different views of data.</abstract>
<identifier type="citekey">basaldella-etal-2020-cometa</identifier>
<identifier type="doi">10.18653/v1/2020.emnlp-main.253</identifier>
<location>
<url>https://aclanthology.org/2020.emnlp-main.253</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>3122</start>
<end>3137</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T COMETA: A Corpus for Medical Entity Linking in the Social Media
%A Basaldella, Marco
%A Liu, Fangyu
%A Shareghi, Ehsan
%A Collier, Nigel
%Y Webber, Bonnie
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F basaldella-etal-2020-cometa
%X Whilst there has been growing progress in Entity Linking (EL) for general language, existing datasets fail to address the complex nature of health terminology in layman’s language. Meanwhile, there is a growing need for applications that can understand the public’s voice in the health domain. To address this we introduce a new corpus called COMETA, consisting of 20k English biomedical entity mentions from Reddit expert-annotated with links to SNOMED CT, a widely-used medical knowledge graph. Our corpus satisfies a combination of desirable properties, from scale and coverage to diversity and quality, that to the best of our knowledge has not been met by any of the existing resources in the field. Through benchmark experiments on 20 EL baselines from string- to neural-based models we shed light on the ability of these systems to perform complex inference on entities and concepts under 2 challenging evaluation scenarios. Our experimental results on COMETA illustrate that no golden bullet exists and even the best mainstream techniques still have a significant performance gap to fill, while the best solution relies on combining different views of data.
%R 10.18653/v1/2020.emnlp-main.253
%U https://aclanthology.org/2020.emnlp-main.253
%U https://doi.org/10.18653/v1/2020.emnlp-main.253
%P 3122-3137
Markdown (Informal)
[COMETA: A Corpus for Medical Entity Linking in the Social Media](https://aclanthology.org/2020.emnlp-main.253) (Basaldella et al., EMNLP 2020)
ACL