wiki_parse.py

from urllib.parse import urlparse
from dataclasses import dataclass
from typing import List, Tuple, Union

import vid_def
import wiki_api
import wikitextparser as wtp


def get_article_title_from_url(url: str) -> str:
    """
    Extracts the Wikipedia article title from a Wikipedia URL.
    This returns the title in escaped (underscore) form, i.e., Like_This.
    If the URL is not in the format en.wikipedia.org/wiki/<title>,
    a ValueError is raised.
    """
    # urlparse doesn't properly parse the domain if a protocol isn't present
    if not url.startswith("http"):
        url = "http://" + url
    result = urlparse(url)
    if result.netloc != "en.wikipedia.org" or not result.path.startswith("/wiki/"):
        raise ValueError(
            "The article URL must have the format en.wikipedia.org/wiki/<title>."
        )
    return result.path.replace("/wiki/", "")
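

# Illustrative examples of the helper above (the inputs are hypothetical, not anything the
# module depends on):
#   get_article_title_from_url("https://en.wikipedia.org/wiki/Miles_Davis") -> "Miles_Davis"
#   get_article_title_from_url("en.wikipedia.org/wiki/Cool_jazz") -> "Cool_jazz"
#   get_article_title_from_url("example.com/wiki/Foo") raises ValueError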


def parse_article_wikitext(article_title: str) -> wtp.WikiText:
    """
    Requests and parses the Wikipedia article named article_title.
    """
    wikitext = wiki_api.get_article_wikitext(article_title)
    parsed = wtp.parse(wikitext)
    return parsed


def escape_article_title(title: str) -> str:
    """
    Escape bare wikilink article titles so we can pass them into the API.
    This only replaces spaces with underscores, so it is almost certainly incomplete.
    """
    return title.replace(" ", "_")


def unescape_article_title(title: str) -> str:
    """
    Unescape article titles for display.
    As the inverse of the above, this is also probably incomplete.
    """
    return title.replace("_", " ")
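

# For example, escape_article_title("Miles Davis") returns "Miles_Davis". Titles containing
# characters that need real percent-encoding in a URL (say, the "?" in
# "Who Wants to Be a Millionaire?") are passed through untouched, which is why the
# docstrings above hedge about these helpers being incomplete.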


@dataclass
class VideoItem:
    # The name of the item
    name: str
    # The title of the article that this item represents
    article_title: str


def video_item_from_wikilink(wikilink: wtp.WikiLink) -> VideoItem:
    return VideoItem(wikilink.title, escape_article_title(wikilink.target))


def extract_list(wikilist: wtp.WikiList) -> List[VideoItem]:
    """
    Extracts VideoItems from a WikiList. We use some heuristics:
    - the FIRST wikilink in a list item is the main article it refers to
    - if a list contains no wikilinks, it doesn't contain any data we're interested in,
      so we return an empty list.
    """
    video_items = []
    for item in wikilist.items:
        # items are returned as strings for some reason, so we have to reparse
        wikilinks = wtp.parse(item).wikilinks
        if len(wikilinks) > 0:
            video_items.append(video_item_from_wikilink(wikilinks[0]))
    # recursively parse any sublists
    for sub_list in wikilist.get_lists():
        video_items.extend(extract_list(sub_list))
    return video_items
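

# Example of the list heuristic (illustrative wikitext; assumes wikitextparser's WikiLink
# .title/.target attributes): for an item like "* [[Miles Davis|the trumpeter]]", the first
# wikilink's title "Miles Davis" becomes the VideoItem name and its target is escaped to
# "Miles_Davis" as the article title.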


# If more than ACCEPTABLE_UNIQUE_FRACTION of a column's linked items are unique, it's eligible
ACCEPTABLE_UNIQUE_FRACTION = 0.9
# Some columns have very few links; as long as more than ACCEPTABLE_LINK_FRACTION of rows
# contain one, the column can still be eligible
ACCEPTABLE_LINK_FRACTION = 0.1


def parse_column(
    table: wtp.Table, column_idx: int, n_rows: int
) -> Tuple[List[wtp.WikiLink], bool]:
    """
    Parse a column of a WikiText table. For performance, we return both a list of wikilinks
    and a boolean indicating eligibility.
    """
    unique_links = dict()
    n_links = 0
    for row in range(n_rows):
        cell = table.cells(row, column_idx)
        # skip cells without links
        if cell is None or len(cell.wikilinks) == 0:
            continue
        # assume the first link in the cell is the one that matters
        link = cell.wikilinks[0]
        # links to files don't count
        if link.target.startswith("File:"):
            continue
        unique_links[link.title] = link
        n_links += 1
    # If there are no links we can return early
    if n_links == 0:
        return [], False
    # determine eligibility
    unique_fraction = len(unique_links) / n_links
    link_fraction = n_links / n_rows
    eligibility = (
        unique_fraction > ACCEPTABLE_UNIQUE_FRACTION
        and link_fraction > ACCEPTABLE_LINK_FRACTION
    )
    return list(unique_links.values()), eligibility
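

# Worked example of the eligibility check (illustrative numbers): in a 10-row column where
# 8 cells contain a non-File wikilink and all 8 link titles are distinct,
# unique_fraction = 8 / 8 = 1.0 > 0.9 and link_fraction = 8 / 10 = 0.8 > 0.1, so the column
# is eligible. A column that repeats the same few links fails the uniqueness test, and a
# column with links in fewer than one row in ten fails the link-fraction test.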


def extract_table(table: wtp.Table) -> List[VideoItem]:
    """
    Extracts VideoItems from a table. Again, we use some heuristics:
    - we return all the wikilinks from the first 'eligible' column, where:
      - an eligible column has 'mostly' unique data
      - an eligible column has 'mostly' wikilinks
    """
    video_items = []
    n_rows = len(table.data())
    if n_rows == 0:
        # this table is messed up, skip it
        return []
    n_columns = len(table.data()[0])
    for i in range(n_columns):
        links, eligible = parse_column(table, i, n_rows)
        if eligible:
            video_items.extend(map(video_item_from_wikilink, links))
            break
    return video_items


def extract_section(section: wtp.Section) -> List[VideoItem]:
    video_items = []
    # according to the docs, this weird pattern will flatten lists
    for l in section.get_lists(r"\*+"):
        video_items.extend(extract_list(l))
    for t in section.get_tables():
        video_items.extend(extract_table(t))
    return video_items


# Common Wikipedia sections that don't contain useful content
SECTION_BLACKLIST = ["See also", "Notes", "References"]


def extract_video_items(parsed: wtp.WikiText) -> List[VideoItem]:
    video_items = []
    for s in parsed.get_sections(include_subsections=True):
        # skip blacklisted sections like See also, which can contain lists we don't want
        if s.title is not None and s.title.strip() in SECTION_BLACKLIST:
            continue
        video_items.extend(extract_section(s))
    return video_items


MAX_CHARS = 250


def clean_extract(extract: str) -> str:
    """
    Do some cleaning work on an article extract from the Wikipedia API.
    We remove any partial sentence at the end of the extract, strip newlines,
    and unescape apostrophes.
    """
    # poor man's sentence boundary detection
    try:
        last_period = extract.rindex(". ")
        extract = extract[: last_period + 1]
    except ValueError:
        # no sentence boundary found; keep the extract as-is
        pass
    extract = extract.replace("\n", "")
    extract = extract.replace(r"\'", r"'")
    return extract
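

# Illustrative behavior of clean_extract: "First sentence. Second sentence. Trailing frag"
# becomes "First sentence. Second sentence." (everything after the last ". " is dropped),
# while an extract with no ". " at all is left alone apart from the newline and
# apostrophe cleanup.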


def segment_from_video_item(item: VideoItem) -> Union["vid_def.Segment", None]:
    """
    Builds a segment from the given video item.
    """
    segment_desc = clean_extract(wiki_api.get_article_extract(item.article_title))
    image_url = wiki_api.get_article_image_url(item.article_title)
    if image_url is None:
        image_url = wiki_api.get_fallback_article_image_url(item.article_title)
    # We really didn't find anything, turf this item
    if image_url is None:
        return None
    return vid_def.Segment(
        name=item.name,
        description=segment_desc,
        image_url=image_url,
        article_url=wiki_api.get_url_from_article_title(item.article_title),
    )
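

if __name__ == "__main__":
    # Minimal end-to-end sketch of how the helpers above fit together. This is only an
    # illustration: the default URL is hypothetical and the wiki_api calls hit the network.
    import sys

    url = sys.argv[1] if len(sys.argv) > 1 else "https://en.wikipedia.org/wiki/Cool_jazz"
    title = get_article_title_from_url(url)
    parsed = parse_article_wikitext(title)
    items = extract_video_items(parsed)
    print(f"Found {len(items)} candidate items in {unescape_article_title(title)}")
    # Building a segment makes a few more API calls per item, so only do a handful here.
    for item in items[:3]:
        segment = segment_from_video_item(item)
        if segment is not None:
            print(f"- built a segment for {unescape_article_title(item.article_title)}")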