Bibtex de la publication

% CORIA 2011 short paper. Literal percent signs in field text are written as \%
% because BibTeX copies field values verbatim into the LaTeX bibliography, where
% a bare percent sign would start a comment.
@InProceedings{KoBoPi2011.2,
  author    = {Kopliku, Arlind and Boughanem, Mohand and Pinel-Sauvagnat, Karen},
  title     = {Mining the {Web} for lists of Named Entities (short paper)},
  booktitle = {Conf{\'e}rence francophone en Recherche d'Information et Applications (CORIA), Avignon, 16/03/2011-18/03/2011},
  year      = {2011},
  month     = mar,
  publisher = {Association Francophone de Recherche d'Information et Applications (ARIA)},
  pages     = {113--120},
  language  = {anglais},
  keywords  = {SIGRI, Information Extraction, named entities, HTML list, Information Retrieval},
  note      = {TauxAcceptation : (t=70, l=16, c=13), (t=41.4\%, l=23\%, c=18\%)},
  abstract  = {Named entities play an important role in Information extraction. They represent unitary namable information within text. In this work, we focus on groups of named entities of the same type which we try to extract from HTML lists. Instead of starting from a class and identifying the corresponding named entities, we want to explore a new paradigm which consists in identifying sets of named entities without any knowledge on the class. A clear advantage of the approach is that it is applicable to all named entities (no matter what class), which makes it domain independent. We use HTML lists to collect candidate sets of named entities. Human assessors assessed a randomly selected sample of HTML lists. 8,25\% of these HTML lists are lists of named entities of the same class. If our estimation is validated at large scale, it is possible to expect at least 890 million of such lists of named entities only in the indexed Web. Moreover, we propose an appropriate classifier which shows promising results.}