-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpublication.py
445 lines (358 loc) · 14.6 KB
/
publication.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
#!/usr/local/bin/python3
"""
Publication objects.
A publication object describes an identified publication in whatever level
of detail we have.
The pub class is intended to be subclassed and different subclasses have
different types of information.
Pubs can come from
- Libraries (like CiteULike or Zotero)
- Alerts (like Google Scholar Emails or RSS feeds)
- A database of previously known pubs.
This would be called an abstract class in C++.
"""
import inspect
import re
import ssl
import sys
import urllib.request
# TLS context passed as ``context=`` to urllib.request.urlopen().
SSL_CONTEXT = ssl.SSLContext(ssl.PROTOCOL_TLS)

# Browser-like request headers. Some publishers restrict access to clients
# that announce themselves as Python (which publishers was not recorded).
# A curl User-Agent ("curl/7.54.0") was also tried, because Wiley responds
# better to curl than to a pretend-Mozilla string; that did not work either.
HTTP_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.10; "
        "rv:62.0) Gecko/20100101 Firefox/62.0"
    ),
}

# Memo of URLs that have already been checked for redirects (requested URL
# -> URL it resolved to). Checking for redirects is expensive, so it is done
# at most once per URL.
redirect_cache = {}
class Pub(object):
    """A single identified publication, in whatever detail is available.

    Pubs can come from many sources, and each source has its own subclass
    that fills in the fields it knows about.
    """

    def __init__(self):
        """Initialise every field to its "not known yet" value.

        Intended to be called from subclass constructors rather than
        directly; subclasses overwrite many of these fields.
        """
        # Title, as given and in canonical form.
        self.title = ""                     # Unmodified title string
        self.canonical_title = None         # Lower, no spaces or non-alphanums

        # Identifiers / locations.
        self.canonical_doi = None           # just the DOI (no http..), lowercase
        self.url = None
        self.ref = ""                       # reference to pub

        # Type of pub, if known (phd thesis, journal article, ...);
        # probably a BibTeX type.
        self.pub_type = None

        # Authors.
        self.authors = ""                   # Author list. No particular format
        # Canonical name of first author: last name, all lower case.
        self.canonical_first_author = None

        # Where and when it was published.
        self.year = None                    # Publication year
        self.journal_name = None            # Name if from a journal
        self.canonical_journal = None       # None if not from a journal

        # Library bookkeeping.
        # TODO: maybe this should be a set?
        self.tags = None                    # List of Keywords for this pub
        self.entry_date = None              # Date added to library.

    def set_title(self, title):
        """Store ``title`` and its canonical form; return the canonical form."""
        self.title = title
        canonical = to_canonical(title)
        self.canonical_title = canonical
        return canonical

    def set_authors(self, authors, canonical_first_author):
        """Store the author list and the canonical first-author name.

        Both values are stored exactly as given. Returns None.
        """
        self.canonical_first_author = canonical_first_author
        self.authors = authors
class PubLibrary(object):
    """A collection of publications.

    Pubs are added with add_pub() and can then be looked up by canonical
    title, given title or canonical DOI. Reporting programs call
    prep_for_reports() once the library is loaded; only after that do
    get_pubs(), get_years() and get_tags() work.

    TODO: Should this inherit from list? Dictionary?
    """

    def __init__(self):
        """Create an empty collection of publications.

        This is only meant to be called by subclass constructors. It is not
        meant to be invoked directly. Many of these fields are overwritten
        by subclass constructors.

        Libraries support several access methods:
          lookup by canonical DOI
          lookup by given title
          lookup by canonical title
        """
        self.url = None
        self.all_pubs = []
        self._by_canonical_title = {}
        self._by_canonical_doi = {}

        # These are populated by prep_for_reports
        self._by_year = None
        self._by_tag = None
        self._by_journal = None
        self._journal_alpha_rank = None
        self.journal_pubs_rank = None

    def add_pub(self, pub):
        """Add a populated publication to the library.

        Raises:
            AssertionError: if the pub has no canonical title. The library
                is left unchanged in that case.
        """
        # Validate before touching any state, so that a rejected pub does
        # not linger in all_pubs (and inflate len()) without being indexed.
        if pub.canonical_title is None or pub.canonical_title == "":
            # everything should have a title
            raise AssertionError("Pub has empty title")

        self.all_pubs.append(pub)
        # TODO: This will overwrite anything with a duplicate title.
        self._by_canonical_title[pub.canonical_title] = pub
        # not everything has a doi
        if pub.canonical_doi:
            self._by_canonical_doi[pub.canonical_doi] = pub

    def __len__(self):
        """Return number of pubs in library."""
        return len(self.all_pubs)

    def get_by_canonical_title(self, canonical_title):
        """Given a canonical title string, return the pub for that title.

        If the title does not exist in the library then return None.
        """
        return self._by_canonical_title.get(canonical_title)

    def get_by_given_title(self, given_title):
        """Given an original, messy, mixed case pub title, return the pub.

        Returns None if pub title is not in library.
        """
        return self._by_canonical_title.get(to_canonical(given_title))

    def get_by_canonical_doi(self, canonical_doi):
        """Given a canonical doi, return the pub with that DOI.

        If the DOI does not exist in the library, return None.
        """
        return self._by_canonical_doi.get(canonical_doi)

    def prep_for_reports(self, only_these_tags_path=None):
        """Build the quick-access indexes that reporting programs use.

        Args:
            only_these_tags_path: optional path to a text file with one tag
                per line. When given, only those tags are indexed (each one
                is present even if no paper carries it) and every other tag
                is ignored. Blank lines in the file are skipped.

        Populates the by-year, by-tag and by-journal indexes (frozensets of
        pubs), the alphabetical rank of each canonical journal name, and
        journal_pubs_rank: one list of pubs per journal, ordered by most
        pubs first and then by canonical journal name.

        Papers with year "unknown" or with no tags are reported on stderr.
        """
        # Plain lists while building; frozen into sets at the end.
        by_year = {}
        by_tag = {}
        by_journal = {}

        if only_these_tags_path:
            # only pay attention to tags listed in the file.
            with open(only_these_tags_path, "r") as tag_file:
                for line in tag_file:
                    wanted_tag = line.strip()
                    if wanted_tag:
                        by_tag[wanted_tag] = []

        for paper in self.all_pubs:
            # Process Year
            if paper.year == "unknown":
                # Fix these when you find them.
                print("Year UNKNOWN: " + paper.title, file=sys.stderr)
            by_year.setdefault(paper.year, []).append(paper)

            # Process tags. A pub whose tags were never set (None) is
            # treated like a pub with an empty tag list.
            tags = paper.tags or ()
            for tag in tags:
                if only_these_tags_path:
                    if tag in by_tag:
                        by_tag[tag].append(paper)
                else:
                    by_tag.setdefault(tag, []).append(paper)
            if not tags:
                # should not happen, flag it when it happens.
                print("Paper missing tags: " + paper.title, file=sys.stderr)

            # Process Journal
            jrnl = paper.canonical_journal
            if jrnl:
                by_journal.setdefault(jrnl, []).append(paper)

        # create set versions
        self._by_year = {
            year: frozenset(papers) for year, papers in by_year.items()}
        self._by_tag = {
            tag: frozenset(papers) for tag, papers in by_tag.items()}
        self._by_journal = {
            jrnl: frozenset(papers) for jrnl, papers in by_journal.items()}

        # key is canonical Journal Name; value is alphabetized rank.
        self._journal_alpha_rank = {
            jrnl: rank for rank, jrnl in enumerate(sorted(by_journal))}

        # Per-journal lists of pubs: most pubs first, ties broken by
        # canonical journal name.
        self.journal_pubs_rank = sorted(
            by_journal.values(),
            key=lambda jrnl_pubs: (
                -len(jrnl_pubs), jrnl_pubs[0].canonical_journal))

    def get_pubs(
            self,
            tag=None,
            year=None,
            journal=None,
            start_entry_date=None,
            end_entry_date=None):
        """Return the papers matching every criterion that is given.

        Any combination of tag, year, journal, start_entry_date and
        end_entry_date may be supplied; criteria left as None (or otherwise
        falsy) are not applied. The date bounds are inclusive.

        Requires prep_for_reports() to have been called.

        Returns:
            A frozenset of pubs when no date bound is given; a list of pubs
            (in no particular order) when a start and/or end date is given.

        Raises:
            KeyError: if tag, year or journal is not in the library.
        """
        restrictions = []
        if tag:
            restrictions.append(self._by_tag[tag])
        if year:
            restrictions.append(self._by_year[year])
        if journal:
            restrictions.append(self._by_journal[journal])

        if restrictions:
            selected = restrictions[0].intersection(*restrictions[1:])
        else:
            selected = frozenset(self.all_pubs)

        # apply date selections if present
        if start_entry_date or end_entry_date:
            selected = [
                paper for paper in selected
                if not (start_entry_date
                        and paper.entry_date < start_entry_date)
                and not (end_entry_date
                         and paper.entry_date > end_entry_date)]

        return selected

    def get_years(self):
        """Return a list of years that papers were published in,
        in chronological order.
        """
        return sorted(self._by_year)

    def get_tags(self):
        """Return a list of tags that have been assigned to pubs, sorted
        in alphabetical order.
        """
        return sorted(self._by_tag)

    def gen_tag_url(self, tag):
        """Given a tag, generate a URL that points to collection
        of pubs with this tag in the online version of this library.

        This method is meant to be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(
            inspect.currentframe().f_code.co_name
            + " not implemented by subclass "
            + self.__class__.__name__)

    def gen_tag_year_url(self, tag, year):
        """Given a tag and a year, generate a URL that shows all papers with
        that tag published in that year.

        This method is meant to be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(
            inspect.currentframe().f_code.co_name
            + " not implemented by subclass "
            + self.__class__.__name__)
def is_google_truncated_title(title_text):
    """Return True if ``title_text`` is a Google Scholar truncated title.

    Google Scholar marks a truncated title by ending it with a non-breaking
    space followed by an ellipsis character. A plain space followed by an
    ellipsis is accepted as well, since the non-breaking space is easily
    turned into an ordinary one when text is copied or re-encoded.

    ``title_text`` is a non-canonical (as given) title string.
    """
    # Escapes rather than literal characters: U+00A0 (no-break space) looks
    # exactly like an ordinary space in an editor, and U+2026 is the
    # single-character ellipsis.
    return title_text.endswith(("\u00a0\u2026", " \u2026"))
def trim_google_truncate(title_text):
    """Remove the Google Scholar truncation marker from the end of a title.

    The marker is an ellipsis character preceded by a non-breaking space
    and/or an ordinary space. The ellipsis and the whole run of spaces in
    front of it are removed, and nothing else.

    If the title does not end with such a marker, it is returned unchanged.
    An ellipsis with no space before it is treated as part of the title.
    """
    ellipsis = "\u2026"
    if not title_text.endswith(ellipsis):
        return title_text

    without_ellipsis = title_text[:-len(ellipsis)]
    # U+00A0 is the no-break space; spelled as an escape because it is
    # indistinguishable from an ordinary space in an editor.
    stem = without_ellipsis.rstrip(" \u00a0")
    if stem == without_ellipsis:
        # No space in front of the ellipsis: not a truncation marker.
        return title_text
    return stem
def to_canonical(messy):
    """Reduce a messy string to its canonical form.

    The canonical form is the input with every run of non-word characters
    (whitespace, punctuation, ...) deleted and the remainder lower-cased.
    Letters, digits and underscores are kept.

    Non-ASCII characters are not converted to ASCII, although that is worth
    thinking about.

    The canonical version of None, and of the empty string, is None.
    """
    if not messy:
        return None
    word_chars_only = re.sub(r'\W+', '', messy)
    return word_chars_only.lower()
def is_canonical_doi(given_doi):
    """Tell whether ``given_doi`` is already in canonical form.

    Here that means only that the value starts with "10." (so it has no
    leading http.. part). None and the empty string are not canonical.
    """
    if given_doi in ("", None):
        return False
    return given_doi.startswith("10.")
def to_canonical_doi(given_doi):
    """Convert a possibly full, mixed case DOI, to just the DOI, all in
    lower case.

    Input DOIs can have these formats:
      - 10.1016/j.iheduc.2008.03.001
      - doi:10.1016/j.iheduc.2008.03.001
      - http://dx.doi.org/10.1016/j.iheduc.2008.03.001
      - https://doi.org/10.1016/j.iheduc.2008.03.001
    And in all sorts of mixed cases, possibly with surrounding whitespace.

    The goal is to return
      10.1016/j.iheduc.2008.03.001

    Everything before the first "10." is stripped off. (The DOI spec allows
    / in the DOI, so the last / can't be used as the split point.)

    The canonical version of
      None is None,
      the empty string is the empty string,
      any string that contains no "10." is the empty string (and a warning
      is printed to stderr).
    """
    if not given_doi:
        # None and "" are returned as they came in.
        return given_doi

    # DOIs are officially case insensitive. Surrounding whitespace is never
    # part of a DOI and would stop otherwise equal DOIs from matching.
    doi_lower = given_doi.strip().lower()

    # if it's a URL (with or without protocol), cut it to just the doi.
    # all DOIs start with 10.
    doi_start = doi_lower.find("10.")
    if doi_start == -1:
        print(
            "Warning: DOI column not empty, but not a DOI: '" +
            given_doi + "'", file=sys.stderr)
        print("  Replacing with the empty string.\n", file=sys.stderr)
        return ""

    return doi_lower[doi_start:]
def get_potentially_redirected_url(pub_url):
    """Return the URL that ``pub_url`` finally resolves to.

    Some URLs (like DOIs) redirect to another URL. Get that URL, or
    return the original URL if it does not redirect.

    The original URL is also returned when
      - pub_url is empty or None (it is handed back untouched),
      - the request fails (the error is reported on stderr), or
      - the final URL ends in "cookieAbsent", which is treated as a
        useless destination.

    Results are remembered in the module-level redirect_cache, so each URL
    is fetched at most once per run.
    """
    if not pub_url:
        # Nothing to resolve.
        return pub_url

    if pub_url in redirect_cache:
        return redirect_cache[pub_url]

    try:
        request = urllib.request.Request(pub_url, headers=HTTP_HEADERS)
        # The with-block closes the connection once the final URL is read.
        with urllib.request.urlopen(
                request, context=SSL_CONTEXT) as url_response:
            redirect_url = url_response.geturl()  # different if redirected
    except Exception:
        # Best effort: any failure (bad URL, network, TLS, HTTP error) falls
        # back to the original URL. Exception rather than BaseException so
        # that Ctrl-C and sys.exit() still stop the program.
        print("Error: {0}".format(sys.exc_info()[0]), file=sys.stderr)
        print(
            "  while processing URL: {0}\n".format(pub_url),
            file=sys.stderr)
        redirect_url = pub_url

    if redirect_url.endswith("cookieAbsent"):  # Give it up
        redirect_url = pub_url

    # Cache the final answer, so that repeat calls return the same value as
    # the first call did.
    redirect_cache[pub_url] = redirect_url
    return redirect_url