import time
import pickle
from SPARQLWrapper import SPARQLWrapper, JSON


def getEnglishURI(URI):
    """Find the English DBpedia URI for a URI from another language edition (via owl:sameAs)."""
    global EN_URIdict
    if URI in EN_URIdict:
        # Use the cached result if this URI was looked up before.
        return EN_URIdict[URI]
    else:
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery("""
        SELECT ?a
        WHERE {
        ?a owl:sameAs <%s>.
        }
        """ % URI)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        time.sleep(0.05)  # to not nuke the DBpedia endpoint
        URI_EN = None  # fall back to None when no owl:sameAs link is found
        for result in results["results"]["bindings"]:
            URI_EN = result["a"]["value"]
        EN_URIdict[URI] = URI_EN
        return URI_EN
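
# Illustrative example (hypothetical values): a Dutch resource such as
# 'http://nl.dbpedia.org/resource/Natuurkunde' is expected to map to an English
# URI like 'http://dbpedia.org/resource/Physics', assuming DBpedia exposes an
# owl:sameAs link between the two resources.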


def getsubjects(URI_EN):
    """
    Return all DBpedia category URIs (dct:subject values) of a resource as a list.
    """
    if URI_EN in subjects_URIdict:
        # Use the cached result if this URI was looked up before.
        return subjects_URIdict[URI_EN]
    else:
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery("""
        SELECT ?a
        WHERE {
        <%s> dct:subject ?a.
        }
        """ % URI_EN)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        time.sleep(0.05)  # to not nuke the DBpedia endpoint
        subjectlist = []
        for result in results["results"]["bindings"]:
            subjectlist.append(result["a"]["value"])
        subjects_URIdict[URI_EN] = subjectlist
        return subjectlist
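
# Illustrative example (hypothetical values): for
# 'http://dbpedia.org/resource/Physics', the returned list would contain category
# URIs such as 'http://dbpedia.org/resource/Category:Physics', assuming the usual
# dct:subject links are present on DBpedia.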


def addEnglishURItodict(d):
    """
    Add the English DBpedia URI to each entry of the dictionary under the key 'URI_EN'.
    """
    for URI_NL in d.keys():
        URI_EN = getEnglishURI(URI_NL)
        d[URI_NL]['URI_EN'] = URI_EN
    return d


def addsubjecttodict(d):
    """
    Adds DBpedia subjects to dictionary under key 'subjects'.
    """
    for URI_NL in d.keys():
        URI_EN = d[URI_NL]['URI_EN']
        subjects = getsubjects(URI_EN)
        d[URI_NL]['subjects'] = subjects
    return d


def getbroader(URI, endpoint):
    """
    Local version.
    Returns the broader concepts (skos:broader) of a URI
    from the local DBpedia dump, queried through `endpoint`.
    """
    global URI_broaderdict
    if URI in URI_broaderdict.keys():
        # Use the cached result if this URI was looked up before.
        broaderlist = URI_broaderdict[URI]
    else:
        broaders = endpoint.query("""
        SELECT ?a
        WHERE {
        <%s> <http://www.w3.org/2004/02/skos/core#broader> ?a.
        }
        """ % URI)
        broaderlist = []
        for item in broaders:
            broaderlist.append('%s' % item)
        URI_broaderdict[URI] = broaderlist
    return broaderlist


def findtoplevel(URI, top_level, endpoint, maxdepth):
    """
    Returns a list of routes from the URI to a value in the top_level set.
    """
    depth = 1
    route = [URI]
    # get all broader categories for the URI subject
    broaderlist = getbroader(URI, endpoint)
    # if the top level is reached directly, return the matching categories
    if not top_level.isdisjoint(broaderlist):
        hit = list(set(top_level) & set(broaderlist))
        return hit
    else:
        return recursive(broaderlist, route, top_level, depth, endpoint, maxdepth)


def recursive(URI_list, route, top_level, depth, endpoint, maxdepth):
    """
    Recursive helper used by findtoplevel().
    Returns a list containing the route of URIs to the
    most abstract discipline, or '' when maxdepth is exceeded.
    """
    # stop when the maximum search depth has been reached
    if depth > maxdepth:
        return ''
    else:
        depth += 1
        setofbroader = set()
        results = []
        for broader in URI_list:
            broaderlist = getbroader(broader, endpoint)
            route.append(broader)
            # if the URI shares broader categories with the top_level set, record the route
            if not top_level.isdisjoint(broaderlist):
                hit = list(set(top_level) & set(broaderlist))
                route.append(hit)
                results.append(route)
            else:
                # otherwise collect the broader categories for the next level of the search
                setofbroader.update(broaderlist)
        if results:
            return results
        else:
            try:
                return recursive(setofbroader, route, top_level, depth, endpoint, maxdepth)
            except Exception:
                # give up on this branch if anything goes wrong (e.g. recursion limit)
                return ''
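

# The sketch below is illustrative and not part of the original module: it shows
# how findtoplevel()/recursive() walk up a skos:broader hierarchy, using a toy
# in-memory endpoint. The `_ToyEndpoint` class, its data, and the category names
# are all hypothetical; only the .query() interface (returning rows that
# stringify to plain URIs, as getbroader() expects) mirrors the real endpoint.
class _ToyEndpoint:
    """Minimal stand-in for the local-dump endpoint used by getbroader()."""

    def __init__(self, hierarchy):
        self.hierarchy = hierarchy  # dict: category URI -> list of broader URIs

    def query(self, sparqlquery):
        # getbroader() always puts the subject URI between the first '<' and '>'.
        subject = sparqlquery.split('<', 1)[1].split('>', 1)[0]
        return self.hierarchy.get(subject, [])


def _demo_findtoplevel():
    """Tiny worked example of the route-finding logic (hypothetical data)."""
    global URI_broaderdict
    URI_broaderdict = {}  # start from an empty cache for the toy data
    endpoint = _ToyEndpoint({
        'Category:Quantum_mechanics': ['Category:Physics'],
        'Category:Physics': ['Category:Natural_sciences'],
    })
    top_level = {'Category:Natural_sciences'}
    # Expected result: [['Category:Quantum_mechanics', 'Category:Physics',
    #                    ['Category:Natural_sciences']]]
    return findtoplevel('Category:Quantum_mechanics', top_level, endpoint, maxdepth=5)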


def makeroute(URI_NL, top_level, d, endpoint, maxdepth):
    """
    IN: URI_NL
    OUT: list of routes of URIs, one per dct:subject of the resource
    """
    resultlist = []
    list_of_subjects = d[URI_NL]['subjects']
    for subject in list_of_subjects:
        resultlist.append(findtoplevel(subject, top_level, endpoint, maxdepth))
    return resultlist


def loadURIdatasets():
    """Load saved DBpedia data from pickles for quicker lookups."""
    global URI_broaderdict
    global EN_URIdict
    global subjects_URIdict
    try:
        with open('datasets/URI_broaderdict.pickle', 'rb') as infile:
            URI_broaderdict = pickle.load(infile)
        print("Loaded URI_broaderdict!")
    except Exception:
        # start with an empty cache if the pickle is missing or unreadable
        URI_broaderdict = dict()
    try:
        with open('datasets/EN_URIdict.pickle', 'rb') as infile:
            EN_URIdict = pickle.load(infile)
        print("Loaded EN_URIdict!")
    except Exception:
        EN_URIdict = dict()
    try:
        with open('datasets/subjects_URIdict.pickle', 'rb') as infile:
            subjects_URIdict = pickle.load(infile)
        print("Loaded subjects_URIdict!")
    except Exception:
        subjects_URIdict = dict()


def dumpURIdatasets():
    """
    Store the acquired DBpedia data in pickles so future runs can reuse it (quicker).
    """
    global URI_broaderdict
    global EN_URIdict
    global subjects_URIdict
    with open('datasets/URI_broaderdict.pickle', 'wb') as outfile:
        pickle.dump(URI_broaderdict, outfile)
    with open('datasets/EN_URIdict.pickle', 'wb') as outfile:
        pickle.dump(EN_URIdict, outfile)
    with open('datasets/subjects_URIdict.pickle', 'wb') as outfile:
        pickle.dump(subjects_URIdict, outfile)
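

# Minimal usage sketch (illustrative, not part of the original module). The input
# URI, the top_level set and the `endpoint` object are hypothetical: `endpoint`
# must be supplied by the caller as a wrapper around a local DBpedia dump whose
# .query() returns rows that stringify to plain URIs, as getbroader() expects.
if __name__ == '__main__':
    loadURIdatasets()

    # One Dutch DBpedia resource to classify (hypothetical input).
    data = {'http://nl.dbpedia.org/resource/Natuurkunde': {}}
    data = addEnglishURItodict(data)
    data = addsubjecttodict(data)

    # Hypothetical set of top-level category URIs at which the search stops.
    top_level = {'http://dbpedia.org/resource/Category:Natural_sciences'}

    # endpoint = ...  # e.g. an interface to a local skos:broader dump (not provided here)
    # routes = makeroute('http://nl.dbpedia.org/resource/Natuurkunde',
    #                    top_level, data, endpoint, maxdepth=5)
    # print(routes)

    dumpURIdatasets()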