This repository has been archived by the owner on Jun 6, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_nif_corpus.py
84 lines (74 loc) · 2.94 KB
/
make_nif_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from rdflib import Graph, URIRef
import sys
import pickle_utils as pkl
from config import Config
import classes
# ------ NIF datasets loader ---------------------
def load_article_from_nif_file(nif_file, corpus_name, limit=1000000):
    """
    Read a NIF corpus and convert it into news item objects.

    The source is parsed into an rdflib ``Graph`` using the "n3" format.
    Every resource that has a ``nif:isString`` value is treated as one
    article (its ``dc:date`` is optional). For each article the graph is
    queried again for the mentions whose ``nif:referenceContext`` is that
    article, ordered by their start index.

    :param nif_file: source handed to ``Graph.parse``.
    :param corpus_name: stored as ``collection`` on every news item.
    :param limit: substituted into the ``LIMIT`` clause of the article query.
    :return: a set of ``classes.NewsItem`` objects built from the article rows.
    """
    print(f'NOW LOADING THE NIF FILE {nif_file}')
    graph = Graph()
    graph.parse(nif_file, format="n3")
    print(f'THE FILE {nif_file} IS LOADED. NOW QUERYING')

    # NOTE(review): neither query declares the nif:/itsrdf: prefixes;
    # presumably they are resolved from the namespace bindings of the parsed
    # file -- confirm for inputs that use different prefix names.
    article_query = """ SELECT ?articleid ?date ?string
WHERE {
?articleid nif:isString ?string .
OPTIONAL { ?articleid <http://purl.org/dc/elements/1.1/date> ?date . }
}
LIMIT %d""" % limit

    # The article URI is filled into the <%s> placeholder once per article.
    mention_query_template = """ SELECT ?id ?mention ?start ?end ?gold
WHERE {
?id nif:anchorOf ?mention ;
nif:beginIndex ?start ;
nif:endIndex ?end ;
nif:referenceContext <%s> .
OPTIONAL { ?id itsrdf:taIdentRef ?gold . }
} ORDER BY ?start"""

    collected = set()
    for row in graph.query(article_query):
        article_uri = row['articleid']

        # Document id: the URI without the nl.dbpedia resource prefix, cut at
        # the first remaining slash.
        stripped_uri = article_uri.replace('http://nl.dbpedia.org/resource/', '')
        doc_id = stripped_uri.split('/')[0]

        mention_rows = graph.query(mention_query_template % str(article_uri))

        mentions = []
        for position, hit in enumerate(mention_rows):
            # NOTE(review): ?gold is OPTIONAL; when it is unbound the str() of
            # whatever rdflib returns (presumably None) is stored as the
            # identity -- confirm this is what downstream code expects.
            target = str(hit['gold'])
            # Links into the notInWiki namespace are replaced by the
            # '--NME--' marker.
            identity = (
                '--NME--'
                if target.startswith('http://aksw.org/notInWiki')
                else target
            )
            mentions.append(
                classes.EntityMention(
                    begin_offset=int(hit['start']),
                    end_offset=int(hit['end']),
                    mention=str(hit['mention']),
                    identity=identity,
                    eid=f'e{position}',
                )
            )

        item = classes.NewsItem(
            content=row['string'],
            identifier=doc_id,
            dct=row['date'],
            collection=corpus_name,
            title='',
            gold_entity_mentions=mentions,
        )
        print(row['articleid'], len(item.gold_entity_mentions))
        collected.add(item)

    return collected
# Entry script: read the config file named on the command line, load the
# news items from the configured NIF file and store them with pickle_utils.
if len(sys.argv) < 2:
    print("Missing config file argument. Now exiting. Usage:")
    print("python make_nif_corpus.py {config_file}")
    # Use sys.exit() instead of the bare exit(): the latter is a helper added
    # by the site module for interactive sessions and is not guaranteed to be
    # defined (e.g. when Python runs with -S).
    sys.exit(1)

cfg = Config(sys.argv[1])
cfg.setup_input()

# Load a number of news items from a NIF file. A falsy cfg.max_documents
# falls back to a limit of 1000000.
news_items = load_article_from_nif_file(cfg.raw_input,
                                        cfg.corpus_name,
                                        limit=cfg.max_documents or 1000000)

# Save the news articles to pickle; the target path is printed first.
print(cfg.news_items_file())
pkl.save_news_items(cfg.news_items_file(), news_items)