-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch.py
473 lines (421 loc) · 17.1 KB
/
fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
""" Abstract class for fetching the results from web engines. """
import urllib
import urllib2
import re
import unicodedata
from bibtex import BibTex
from collections import defaultdict
class Fetch(object):
"""
Abstract class handling GET/POST requests.
class variable url: holds main part of the link to web engine
self.refs: holds bibtex string
self.nonunique: true if exactly one result found
self.number: number of results
"""
url = ""
correction = 0
onceMax = 100
more = ""
def __init__(self, otherID=False):
"""
Set main search engine url in subclass.
otherID can be used to instruct fetcher to run other fetchers.
Even if user requests 1 record, it may still be nonunique, since
fetcher might find more than 1 and return just 1.
"""
self.refs = []
self.nonunique = True
self.bibitem = ""
self.number = 0
self.otherID = otherID
def __call__(self, query, count, dct):
"""
Make object act like a function.
Allows for easy pickling and calling the object in multiprocessing.
Returns BibTex and LaTex outputs.
"""
if "website" in dct and dct["website"]:
return '', self.getWebsite(query, count)
else:
self.fetch(query, count)
return self.refs, self.getRefs(**dct)
def fetch(self, query, count=100):
""" Process the query and execute search. """
self.count = count
query = fixQuery(query)
query_dict = self._preprocessQuery(query)
if not query_dict:
self.number = 0
self.refs = ""
self.nonunique = True
return False
query = self._formatQuery(query_dict)
query = urllib.urlencode(query) + "&"
self.query = query
found = 0
allrefs = []
while found < count:
full_url = self.url + query
if self.more:
full_url += self.more + str(found+self.correction)
try:
data = urllib2.urlopen(full_url).read()
self._processResults(data)
self._cleanupBibTex(count)
except:
self.number = 0
if self.number == 0:
break
allrefs.append(self.refs)
found += self.number
if not self.more or found < self.onceMax:
break
self.refs = '\n\n'.join(allrefs)
# result can be nonunique even if user wants to see 1 record
self.nonunique = found > 1
if found > count:
# too many results, truncate
self.refs = '\n\n'.join(self.refs.split('\n\n')[:count])
found = count
self.number = found
if self.number == 0:
self.refs = ""
self.nonunique = True
return False
return True
def getWebsite(self, query, count=100):
""" Execute search but do not postprocess the results. """
query = fixQuery(query)
query_dict = self._preprocessQuery(query)
query = self._formatQuery(query_dict)
query = urllib.urlencode(query) + "&"
full_url = self.website + query
data = urllib2.urlopen(full_url).read()
data = self._cleanWebsite(data)
self.number = 1
self.refs = ""
return data
def _preprocessQuery(self, query):
"""
Turn query into a dict with bibtex style fields.
Handles bibtex, separate fields, and full citation forms of querries.
Each parser can later format it.
"""
if re.match(r"(?si)(\n|\s|\r)*@\w+\{", query):
return self._bibtexQuery(query)
elif re.match(r"(?si)\\(bibitem|text|emph|newblock|bf\s|it\s)", query):
# seems like LaTeX formatted full citation
return self._citationQuery(query)
elif re.match(r"(?si).*\b(\w{2,3}|date|year):", query):
# found a field specifier
return self._fieldsQuery(query)
elif re.match(r"(?si)(.*\n)?\s*(\w:|\d{4,})", query):
# line starts with short query field or date?
return self._fieldsQuery(query)
elif len(query) > 40 and len(query.split("\n")) < 3:
# long query with few lines
return self._citationQuery(query)
else:
# try guessing fields
# if the query is a full citation there should be enough to get it
# as a genral field
return self._fieldsQuery(query)
def _bibtexQuery(self, query):
""" Turn query into bibtex dictionary. """
import bibtexparser
from bibtexparser.bparser import BibTexParser
parser = BibTexParser()
parser.customization = homogeneize_latex_encoding
bib = bibtexparser.loads(query, parser=parser)
if bib.entries:
# only the first record
record = bib.entries[0]
# clean up entries
if "author" in record:
# just last name
record["author"] = re.sub(r',.*?(and\s*|$)', ' ',
record['author'])
if "title" in record:
record["title"] = self._citationQuery(record["title"])[0][1]
if "journal" in record:
record["journal"] = self._citationQuery(record["journal"])[0][1]
if "year" in record:
record["date"] = record["year"]
# only use a few fields
# TODO add numbers
return [(k, v) for k, v in record.items() if k in
{"author", "title", "journal", "mrnumber", "date",
"arxiv", "zbl"}]
else:
return []
def _citationQuery(self, query):
""" Save bibitem and strip formatting from query. """
# get rid of bibitem and save it
match = re.match(
r"\s*(\\bibitem\s*(\[.*?\])?\s*\{(\w|\{(\{.*?\}|[^{}])*?\}|\+)*\})",
query)
if match:
self.bibitem = match.group(1)
query = re.sub(
r"\s*(\\bibitem\s*(\[.*?\])?\s*\{(\w|\{(\{.*?\}|[^{}])*?\}|\+)*\})",
"", query)
# cleanup
# remove links and numbers from the end
query = re.sub(r"(?si)mr\d{6,}", "", query)
query = re.sub(r"(?si)(https?:|\\doi|\\mref|\\arxiv|MR\s*:?\s*\d{4,}|" +
r"zbl\s*\d+|arxiv:?\s*\d+|\\url|\\href).*", "", query)
# remove tex commands
query = re.sub(r"(?si)\\[A-Za-z]{2,}", "", query)
# remove accents, but not {letter}
query = re.sub(r"(?si)\{\\'\{\\i\}\}|\\'\\i\s", "i", query)
query = re.sub(r"(?si)\\c s", "s", query)
query = re.sub(r"(?si)\{\\l\}", "l", query)
query = re.sub(r"(?si)\\\W\s?|\\\w(?=\\|\{)", "",
query).replace('~', ' ')
# remove formulas
query = re.sub(r"(?si)\$.*?\$", "", query)
# remove a few words
query = re.sub(r"(?si)\(electronic\)", " ", query)
query = re.sub(r"(?si)(dedicated\s+to).*", " ", query)
# remove certain characters
query = re.sub(r"[][{}?%#():`]", "", query)
query = re.sub(r"[&=/,']", " ", query)
# remove short words
query = re.sub(r"(?si)\b(and|not|und|art|vol|eds|isbn|inc|the|with)\b",
" ", query)
query = re.sub(r"(?si)\b[A-Za-z]{1,2}\b", " ", query)
# remove all numbers except for years (4 digits)
query = re.sub(r"(?si)\b\d+\s*-+\s*\d+\b", " ", query)
query = re.sub(r"(?si)\b(\d{1,3}|\d{5,})\b", " ", query)
# remove preprint and other endings
query = re.sub(r"(?si)\b(preparation|arxiv|preprint|to\s+appear|" +
r"submitted|translated\s+from).*",
" ", query)
# remove publisher names, places and some other words
query = re.sub(r"Birkhauser|Springer|New York|Paris|Heidelberg|" +
r"Boston|Basel|Grenoble|Verlag",
"", query)
# short word with . is most likely an abbreviation
query = re.sub(r"(\b[a-zA-Z]{,4})\.", r"\1*", query)
# remove a few more characters
query = re.sub(r"[-.]", " ", query)
query = re.sub(r"\\", "", query)
query = re.sub(r"\s+\*+(\s+|$)", " ", query)
# remove extra white characters
query = re.sub(r"(?si)([\s\n\r]+)", " ", query).strip()
year = re.findall(r'\d{4}', query)
if year and len(year) == 1:
# extract year as separate field
query = re.sub(r'\d{4}', ' ', query)
return [('none', query), ('date', year[0])]
else:
return [('none', query)]
def _fieldsQuery(self, query):
r"""
Detect fields and form a list.
List format:
['', # empty
then repeated
') and \n not (', # parentheses and logic with new line
('field', 'value'),
The same query can contain many values for the same field.
"""
# cleanup
# remove tex commands
query = re.sub(r"(?si)\\[A-Za-z]{2,}", "", query)
# remove accents, but not {?}
query = re.sub(r"(?si)\\\W|\\\w(?=\\|\{)", "", query).replace('~', ' ')
# remove formulas
query = re.sub(r"(?si)\$.*?\$", "", query)
# remove {}&?%=/#.
query = re.sub(r"[{}&?%=/#.]", "", query)
# TODO pyparse could make this easier
# try to find the fields by adding \n before each field indicator
query = re.sub(r"(?<=[\s()])([a-zA-Z]{2,3}|date|year|type):",
r"\n\1:", query)
# start with new line to ensure parentheses/logic before the first query
query = '\n' + query
# split preserving and/or/not/(/) at the boundaries of the fields/lines
# this allows for rebuilding of a complex query with parentheses
lines = re.split(
r"(?si)((?:[\s()]|and|not|or)*" + # before new line
r"(?:\n+|$)" + # new line or end
r"(?:[\s()]|and|not|or)*)", # after new line
query)
lst = []
for line in lines:
if re.match(r"(?si)([\s()\n]|and|or|not)*$", line):
# parentheses and/or logic
lst.append(line)
continue
# detect date (range) with or without field
date = re.match(
r"(?si)((?:py|yr|dt|date|year):[\D]*?)?" + # field or not
r"([<=>]?\s*\d{4}(\s*-+\s*\d{4}|(\b\d{4}\b|[,\s])+)?)", # dates
line)
author = re.match(r"(?si)(a|au|aut[hors]*):(?P<c>.*)", line)
journal = re.match(
r"(?si)(j|jo|jou[rnal]*|s|so|sou[rce]|jr*):(?P<c>.*)", line)
title = re.match(r"(?si)(t|ti|tit[le]*):(?P<c>.*)", line)
if date:
lst.append(("date", date.group(2)))
elif re.match(r"type:|ty:|\s*(not\s)?\s*(book|journal|proceeding)",
line):
line = self._publicationType(line)
if line:
lst.append(("type", line))
elif author:
author = author.group("c").strip()
author = re.sub(r"(\w{2,},\s+\w)(?=\s|$)", r"\1*", author)
lst.append(("author", author))
elif journal:
lst.append(("journal", journal.group("c").strip()))
elif title:
lst.append(("title", title.group("c").strip()))
elif re.match(r"(any|all|^):", line):
# all fields search
lst.append(("all", re.sub(r".*?:\s*", "", line)))
elif re.match(r"\w{2,3}:", line):
# unrecognized field
m = re.match(r"(?si)(\w{2,3}):\s*(.*)$", line)
lst.append(m.group(1, 2))
elif re.match(r"(?si)\s*\w+,\s+\w(\s|\*|$)", line):
# author without field specification
line = (line + '*').replace('**', '*')
lst.append(("author", '"'+line+'"'))
else:
# something
lst.append(("none", line))
return lst
def _publicationType(self, line):
"""
Format type of the source according to zbMATH style.
Publication types: journal/book/(book article=proceedings).
"""
line = line.replace("type:", "").replace("ty:", "") \
.replace("book article", "a") \
.replace("proceeding", "a").replace("book", "b") \
.replace("journal", "j").replace("s", "") \
.replace("not", "!").replace("and", '|').replace("or", "|") \
.replace("p", "a")
line = re.sub(r"\s+", "", line)
line = re.sub(r"[^abj|!&]", "", line)
line = re.sub(r"\B", "|", line)
if re.match("(!?[abj])([!|&][abj])?([!|&][abj])?$", line):
# we have a zbl formatted type
return line
else:
return ""
def _formatQuery(self, query_dict):
"""
Query formatting should be implemented in subclasses.
The resulting format should be a string ready for URL inclusion.
Each field in query_dict is a list of strings.
"""
pass
def _processResults(self, data):
"""
Result processing should be implemented in subclasses.
Valid BibTex entries must be placed in self.refs.
Number of records should be placed in self.number.
"""
pass
def getRefs(self, **format_dict):
"""
Return the list of fetched and formatted references.
Run BibTex if necessary.
"""
# nonexistent key get value "" (also evaluates to False)
dct = defaultdict(str)
try:
dct.update(format_dict)
except:
pass
if dct["bibtexOut"]:
return self.refs
if self.bibitem and dct["keepBibitems"]:
# bibitem exists and should be kept, so do not generate
dct["genBibitems"] = False
if not dct["keepBibitems"] or dct["genBibitems"]:
# remove saved bibitem if should be generated or not kept
self.bibitem = ""
return self._processBibTex(dct)
def _processBibTex(self, format_dict):
"""
Execute BibTex with options taken from format dictionary.
Returns formatted string with citations.
"""
try:
bib = BibTex(**format_dict)
formatted = bib.run(self.refs)
assert formatted is not ""
formatted = self.bibitem + formatted
return formatted
except:
# bibtex failed, return unformatted bibtex entries
return self.refs
def _cleanupBibTex(self, count):
""" Clean up bibtex and ensure uniform look. """
import bibtexparser
from bibtexparser.bparser import BibTexParser
parser = BibTexParser()
parser.customization = homogeneize_latex_encoding
bib = bibtexparser.loads(self.refs, parser=parser)
# save results
from bibtexparser.bwriter import BibTexWriter
writer = BibTexWriter()
writer.contents = ['entries']
writer.indent = ' '
writer.order_entries_by = ('id')
self.number = len(bib.entries)
self.refs = bibtexparser.dumps(bib, writer)
def setBibtex(self, bibstr):
"""
Use ready bibtex string in the fetcher.
Can be used to get formatted references out of the fetcher.
"""
bibstr = fixQuery(bibstr)
self.refs = str(bibstr)
self._cleanupBibTex(None)
def fixQuery(query):
""" Turn unicode or str into nice str. """
try:
if isinstance(query, str):
# string may be encoded
# should probably assume utf-8
# but assume Eastern Europe :)
query = unicode(query, 'cp1250')
# Polish \l may end up as the below characters
query = query.replace(u"\xc2\u0142", "l") \
.replace(u"\xc2\u0141", "L")
# eliminate all strange characters
# again handle Polish \l
# query better be unicode by now
query = unicodedata.normalize('NFKD', query) \
.replace(u'\u0141', 'L').replace(u'\u0142', 'l') \
.encode('ascii', 'ignore')
except:
# what to do?
query = unicode(query, errors='ignore')
return query
# modified bibtexparser.latexenc content
from bibtexparser.latexenc import protect_uppercase, unicode_to_latex_map
def string_to_latex(string):
""" Convert a string to its latex equivalent. """
escape = [' ', '{', '}']
new = []
for char in string:
if char in escape or ord(char) < 128:
new.append(char)
else:
new.append(unicode_to_latex_map.get(char, char))
return ''.join(new)
def homogeneize_latex_encoding(record):
""" Replace accents with latex equivalents. """
for val in record:
if val not in ('ID',):
record[val] = string_to_latex(record[val])
if val == 'title':
record[val] = protect_uppercase(record[val])
return record