analyze.py
import re
import string
from urllib.parse import urlencode

import requests
from nltk.tag.stanford import POSTagger

# The German Stanford POS tagger is expected to be unpacked next to this script;
# newer NLTK releases expose the same class under the name StanfordPOSTagger.
tagger = POSTagger('./stanford-postagger-full-2014-10-26/models/german-fast.tagger',
                   './stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar',
                   'UTF-8')

# Strips punctuation before tagging, so tokens like "Wedding." match cleanly.
punctuation_regex = re.compile("[%s]" % re.escape(string.punctuation))

def get_potential_places(article_place, article_body):
    """
    Returns a list of potential places as lists of (word, tag) tuples with
    their part-of-speech tags for later filtering
    """
    place_pos = tagger.tag(punctuation_regex.sub(" ", article_place).split())
    text_pos = tagger.tag(punctuation_regex.sub(" ", article_body).split())
    # extract the places out of the full text
    places = [place_pos]
    is_matching = False
    current_match = []
    for token in text_pos:
        if is_matching:
            # while we're matching, the phrases we're looking for look like
            # "Im S-Bahnhof Wedding": articles, adjectives, nouns, proper
            # nouns and numbers may all continue the match
            if token[1] in ("ART", "ADJA", "NN", "NE", "CARD"):
                current_match.append(token)
            else:
                # the match stops here, so store what we collected
                places.append(current_match)
                current_match = []
                # when we're looking at a preposition again, just start a new match
                if token[1] not in ("APPR", "APPRART"):
                    is_matching = False
        else:
            # start matching when we hit a preposition
            if token[1] in ("APPR", "APPRART"):
                is_matching = True
    # don't lose a match that runs up to the end of the text
    if current_match:
        places.append(current_match)
    return places
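
# Example of what get_potential_places returns (a sketch: the exact POS tags
# depend on the Stanford German model, so the output is illustrative only):
#
#   get_potential_places("Wedding", "Der Mann wurde im S-Bahnhof Wedding beleidigt")
#   # -> [[('Wedding', 'NE')], [('S-Bahnhof', 'NN'), ('Wedding', 'NE')]]
#
# The first entry is the tagged article place itself; the second was collected
# because "im" (APPRART) opened a match that ran until the verb "beleidigt".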

def improve_potential_places(pos_tuples):
    """
    Improves the matches' quality so we don't have to look up the lat-lng of so
    many mismatches
    """
    better_tuples = []
    for tuple_list in pos_tuples:
        # first, exclude empty lists
        if tuple_list:
            cleaner_list = []
            for index, token in enumerate(tuple_list):
                # exclude articles ("the", "a") beginning the phrase, they only
                # introduce noise, but keep the list as a whole
                if token[1] == "ART" and index == 0:
                    continue
                # if we have numbers in the middle of our phrase, the whole
                # list is probably not useful (as opposed to a trailing house
                # number such as "Krügerstr. 22", which we keep)
                if token[1] == "CARD" and index < len(tuple_list) - 1:
                    cleaner_list = []
                    break
                cleaner_list.append(token)
            if cleaner_list:
                better_tuples.append(cleaner_list)
    return better_tuples
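
# Example for improve_potential_places (pure list processing, no tagger needed;
# the tags below are hand-written for illustration):
#
#   improve_potential_places([
#       [('dem', 'ART'), ('Bahnhof', 'NN')],                   # leading article dropped
#       [('Krügerstr.', 'NN'), ('22', 'CARD')],                # trailing house number kept
#       [('Linie', 'NN'), ('8', 'CARD'), ('Richtung', 'NN')],  # mid-phrase number: discarded
#       [],                                                    # empty match: discarded
#   ])
#   # -> [[('Bahnhof', 'NN')], [('Krügerstr.', 'NN'), ('22', 'CARD')]]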

def get_categories(article_body):
    """
    Gives a set of categories an article falls into, which is empty if none of
    the following are matched:
    - sexism
    - antisemitism
    - homophobia
    - racism
    """
    bad_words = {
        'antisemit': 'antisemitism',
        'jud': 'antisemitism',
        'jüd': 'antisemitism',
        'homo': 'homophobia',
        'schwul': 'homophobia',
        'lesb': 'homophobia',
        'trans': 'homophobia',
        'sexis': 'sexism',
        'frauenfeind': 'sexism',
        'rassis': 'racism',
        'fremdenfeind': 'racism',
        'flüchtling': 'racism',
        'migrant': 'racism'
    }
    found_categories = {bad_words[key] for key in bad_words
                        if key in article_body.lower()}
    return found_categories
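
# Example for get_categories (plain substring matching, so this is deterministic):
#
#   get_categories("Rassistische Beleidigung gegen einen Flüchtling am U-Bahnhof")
#   # -> {'racism'}   ("rassis" and "flüchtling" both map to the same category)
#
#   get_categories("Fahrgast beschimpft Kontrolleur")
#   # -> set()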

def get_geoloc(query):
    """
    Geocodes a query string via the Google Geocoding API, biased towards Berlin.
    Note that current versions of the API expect HTTPS and an API key.
    """
    # map Google's location_type values to a rough numeric confidence score
    confidence_map = {
        "ROOFTOP": 10,
        "RANGE_INTERPOLATED": 7,
        "GEOMETRIC_CENTER": 4,
        "APPROXIMATE": 1
    }
    params = {
        "address": query + ", Berlin",
        "bounds": "52.6754542,13.7611176|52.33962959999999,13.0891553",
        "components": "country:DE",
        "sensor": False
    }
    url = "http://maps.googleapis.com/maps/api/geocode/json?" + urlencode(params)
    r = requests.get(url).json()["results"]
    locations = []
    for location in r:
        print(location)
        locations.append({
            "lat": location["geometry"]["location"]["lat"],
            "lng": location["geometry"]["location"]["lng"],
            "confidence": confidence_map[location["geometry"]["location_type"]],
            "returned_place": location["formatted_address"]
        })
    return locations
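
# A minimal end-to-end sketch of how the helpers above fit together. It assumes
# the Stanford tagger files referenced at the top are available locally and that
# the geocoding endpoint is reachable (the current Google API also expects an
# API key and HTTPS, which this script does not send), so treat it as an
# illustration rather than a hardened entry point.
if __name__ == "__main__":
    sample_place = "Wedding"
    sample_body = ("Ein Mann wurde im S-Bahnhof Wedding rassistisch beleidigt "
                   "und geschlagen.")

    print("categories:", get_categories(sample_body))

    candidates = improve_potential_places(
        get_potential_places(sample_place, sample_body))
    for candidate in candidates:
        query = " ".join(word for word, _tag in candidate)
        print(query, "->", get_geoloc(query))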