make_wiki_corpus.py
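"""Build a corpus of news items from a MediaWiki XML dump.

Parses the raw Wikinews dump named in the config, skips pages in
non-article namespaces, strips wiki markup, and saves each kept page
both as a JSON file and as a pickled NewsItem object.
"""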
from lxml import etree
import json
import os
import glob
import wiki_utils as utils
import pickle_utils as pkl
import classes
from config import Config, create_dir
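# wiki_utils, pickle_utils, classes, and config are project-local modules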
# Specify your config file here:
cfg = Config('cfg/wikinews.yml')
cfg.setup_input()
input_dir = "{}/input_docs".format(cfg.experiment_dir)
create_dir(input_dir)
# ------------------------------------------------------
# Load configuration variables
max_docs = cfg.max_documents
min_length = cfg.min_text_length
# Hard-coded element names for the MediaWiki export XML schema
marker_base = '{http://www.mediawiki.org/xml/export-0.10/}'
title_marker = f'{marker_base}title'
page_marker = f'{marker_base}page'
text_marker = f'{marker_base}revision/{marker_base}text'
# Dutch Wikinews namespace prefixes and title fragments to skip
skip_starters = ['H:', 'WN:', 'Sjabloon:', 'Bestand:', 'Help:', 'Module:',
                 'Categorie:', 'MediaWiki:', 'Wikinieuws:']
skip_contains = ['Nieuwsbrief Wikimedia Nederland']
# Prepare variables and clean the output directory
news_items = set()
for f in glob.glob('%s/*.json' % input_dir):
    os.remove(f)
counter = 1
# Parse input XML file
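# Note: etree.parse loads the whole dump into memory; for very large dumps,
# lxml's streaming etree.iterparse would be an alternative (not used here).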
tree = etree.parse(cfg.raw_input)
root = tree.getroot()
for x in root.findall(page_marker):
    title = x.find(title_marker).text
    # Skip pages whose title is in a non-article namespace
    # or contains a blocked fragment
    if any(title.startswith(prefix) for prefix in skip_starters):
        continue
    if any(fragment in title for fragment in skip_contains):
        continue
    # Load the page text
    text = x.find(text_marker).text
    docid = 'wiki_%d' % counter
    # Create a news item object with the information from the text and the links
    the_text, the_links = utils.get_text_and_links(text)
    clean_text = utils.clean_wiki(the_text)
    clean_title = utils.clean_wiki(title)
    if len(clean_text) < min_length:
        continue
    # TODO: create_gold_mentions is deprecated (needs to be improved)
    news_item_obj = classes.NewsItem(
        content=clean_text,
        title=clean_title,
        identifier=docid,
        collection=cfg.corpus_name
        # gold_entity_mentions=utils.create_gold_mentions(the_links,
        #                                                 clean_text)
    )
    news_items.add(news_item_obj)
    # Save it to JSON
    j = {'title': clean_title,
         'body': clean_text}
    with open('%s/%s.json' % (input_dir, docid), 'w') as outfile:
        json.dump(j, outfile)
    if max_docs and counter >= max_docs:
        break
    counter += 1
# Save the news item objects as a pickle
pkl.save_news_items(cfg.news_items_file(), news_items)
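
# Optional sanity check (an illustrative sketch, not part of the original
# pipeline): read back the first emitted JSON document and print its title.
# The 'wiki_1.json' filename assumes at least one page passed the filters.
sample_path = '%s/wiki_1.json' % input_dir
if os.path.exists(sample_path):
    with open(sample_path) as infile:
        sample = json.load(infile)
    print('First document title:', sample['title'])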