"""
Greynir: Natural language processing for Icelandic
Copyright (C) 2023 Miðeind ehf.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
This module implements a processor that extracts any addresses / locations
in article tokens, looks up information about them and saves to a database.
"""
from typing import List

from datetime import datetime, timezone

from db.models import Location
from tokenizer import TOK

from geo import location_info
from tree import TreeStateDict, Loc
from tree.util import TokenDict

MODULE_NAME = __name__
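
# A "token" processor receives individual article tokens via the
# token() hook below, rather than operating on full parse trees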
PROCESSOR_TYPE = "token"
LOCFL = ["lönd", "göt", "örn", "borg"]
LOCFL_TO_KIND = dict(zip(LOCFL, ["country", "street", "placename", "placename"]))
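# The resulting mapping is:
# {"lönd": "country", "göt": "street", "örn": "placename", "borg": "placename"}
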
# Always identify these words as locations, even when they
# have been identified as words in some other category.
ALWAYS_LOCATION = frozenset(
    (
        "París",  # also ism in BÍN
        # "Aþena",  # ism
        # "Árborg",  # ism
        # "Borg",  # ism
        # "Hella",  # ism
    )
)
# GENERAL_BLACKLIST = frozenset(())
PLACENAME_BLACKLIST = frozenset(
    (
        "Sámur",
        "Staður",
        "Eyjan",
        "Eyja",
        "Fjöll",
        "Bæir",
        "Bær",
        "Rauða",
        "Hjálp",
        "Stjórn",
        "Hrun",
        "Hrunið",
        "Mark",
        "Bás",
        "Vatnið",
        "Vatn",
        "Á",
        "Flag",
        "Stigi",
        "Kjarni",
        "Hagar",
        "Þing",
        "Langa",
        "Hús",
        "Kirkjan",
        "Kirkja",
        "Maður",
        "Systur",
        "Pallar",
        "Snið",
        "Stöð",
        "Síða",
        "Síðan",
        "Hundruð",
        "Hestur",
        "Skipti",
        "Skólinn",
        "Skurður",
        "Gat",
        "Eik",
        "Hlíf",
        "Karl",
        "Félagar",
        "Lækur",
        "Lægðin",
        "Prestur",
        "Paradís",
        "Lón",
        "Land",
        "Gil",
        "Höllin",
        "Höll",
        "Fjórðungur",
        "Grænur",
        "Hagi",
        "Brenna",
        "Hraun",
        "Opnur",
        "Guðfinna",  # !
        "Svið",
        "Öxi",
        "Skyggnir",
        "Egg",
        "Toppar",
        "Toppur",
        "Einkunn",
        "Borgir",
        "Langur",
        "Drög",
        "Haf",
        "Fossar",
        "Stuðlar",
        "Straumur",
        "Eden",
        "Haft",
        "Rétt",
        "Veitur",
        "Örkin",
        "Svangi",
        "Samvinna",
        "Stígamót",
        "Tafla",
        "Rauði",
        "Reitar",
        "Festi",
        "Bekkur",
        "Bakland",
    )
)
STREETNAME_BLACKLIST = frozenset(("Mark", "Á", "Sjáland", "Hús", "Húsið"))
# COUNTRY_BLACKLIST = frozenset(())


def article_begin(state: TreeStateDict) -> None:
    """Called at the beginning of article processing"""

    session = state["session"]  # Database session
    url = state["url"]  # URL of the article being processed

    # Delete all existing locations for this article
    session.execute(Location.table().delete().where(Location.article_url == url))  # type: ignore

    # Set that will contain all unique locations found in the article
    state["locations"] = set()


def article_end(state: TreeStateDict) -> None:
    """Called at the end of article processing"""

    locs = state.get("locations")
    if not locs:
        return

    url = state["url"]
    session = state["session"]

    # Find all placenames mentioned in the article;
    # we can use them to disambiguate addresses and street names.
    # TODO: Perhaps do this in a more fine-grained manner, at a
    # sentence or paragraph level.
    placenames = [p.name for p in locs if p.kind == "placename"]

    # Get info about each location and save it to the database
    for name, kind in locs:
        loc = location_info(name=name, kind=kind, placename_hints=placenames)

        loc["article_url"] = url
        loc["timestamp"] = datetime.now(timezone.utc)

        print(f"Location '{loc['name']}' is a {loc['kind']}")

        locmodel = Location(**loc)
        session.add(locmodel)


# def paragraph_begin(state, paragraph):
#     pass


# def paragraph_end(state, paragraph):
#     pass


# def sentence_begin(state, paragraph, sentence):
#     pass


# def sentence_end(state, paragraph, sentence):
#     pass


def token(
    state: TreeStateDict,
    paragraph: List[List[TokenDict]],
    sentence: List[TokenDict],
    token: TokenDict,
    idx: int,
) -> None:
    """Called for each token in each sentence. idx is the
    index of the token within the sentence."""

    if "m" not in token or len(token["m"]) < 3:
        return

    name = token["m"][0]  # Nominative case
    fl = token["m"][2]  # BÍN category

    if fl not in LOCFL and name not in ALWAYS_LOCATION:
        return

    kind = LOCFL_TO_KIND.get(fl)

    # Skip if blacklisted
    # if name in GENERAL_BLACKLIST:
    #     return
    if kind == "placename" and name in PLACENAME_BLACKLIST:
        return
    if kind == "street" and name in STREETNAME_BLACKLIST:
        return
    # if kind == "country" and name in COUNTRY_BLACKLIST:
    #     return

    # Special handling of addresses:
    # check whether the next token is a house number
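    # e.g. the street "Laugavegur" followed by the number token "12"
    # is merged into the address "Laugavegur 12"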
if kind == "street" and idx != len(sentence) - 1: # not last token in sentence
next_tok = sentence[idx + 1]
next_word = next_tok["x"]
if "k" in next_tok and (
next_tok["k"] == TOK.NUMBER or next_tok["k"] == TOK.NUMWLETTER
):
name = f"{name} {next_word}"
kind = "address"
# Add
loc = Loc(name=name, kind=kind)
state["locations"].add(loc)