-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathGoogleScholar.py
186 lines (147 loc) · 5.7 KB
/
GoogleScholar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
#
# Information about alert from Google Scholar
import re
import alert
import quopri # Quoted printable encoding
import html.parser
import urllib.parse
# Presumably the From address used to recognise Google Scholar alert emails.
# NOTE(review): the literal below looks like a redacted placeholder rather than
# a real address -- confirm the intended value before relying on it.
SENDER = "[email protected]"
# A space followed by chr(8230), i.e. U+2026 HORIZONTAL ELLIPSIS.
ELLIPSIS_TAIL = " " + chr(8230) #" …"
# Length of that tail, precomputed for slicing it off the end of a string.
ELLIPSIS_TAIL_LEN = len(ELLIPSIS_TAIL)
#SD_JHU_PII_URL = "http://www.sciencedirect.com.proxy1.library.jhu.edu/science/article/pii/"
#SD_PII_URL = "http://www.sciencedirect.com/science/article/pii/"
class Paper(alert.PaperAlert, html.parser.HTMLParser):
    """
    One paper reported in a Google Scholar alert.

    Every descriptive field starts out as an empty string and is filled in
    by whoever parses the alert; ``search`` starts out as "Google ".
    """

    def __init__(self):
        """
        Initialise the base classes and blank out every field.
        """
        # NOTE(review): super() is anchored at alert.PaperAlert, so method
        # lookup starts *after* PaperAlert in the MRO and PaperAlert's own
        # __init__ is not what this call runs -- confirm that is intended.
        super(alert.PaperAlert, self).__init__()
        html.parser.HTMLParser.__init__(self)

        # Bibliographic details.
        self.title = ""
        self.titleTruncated = False
        self.authors = ""
        self.source = ""

        # Identifiers and links.
        self.doiUrl = ""
        self.doi = ""
        self.url = ""
        self.hopkinsUrl = ""

        # Label of the search that produced this paper.
        self.search = "Google "

    def getFirstAuthorLastName(self):
        """
        Return the last space-separated word of the first comma-separated
        author, or None when no authors are recorded.

        Example author string:
          DM Meinel, G Margos, R Konrad, S Krebs, H Blum ...

        Multi-word surnames (van Drysdale etc.) come back as their final
        word only.
        """
        if not self.authors:
            return None
        leadAuthor = self.authors.partition(",")[0]
        return leadAuthor.rpartition(" ")[2]

    def getFirstAuthorLastNameLower(self):
        """
        Lower-cased form of getFirstAuthorLastName(); an empty or missing
        name is passed through unchanged.
        """
        lastName = self.getFirstAuthorLastName()
        return lastName.lower() if lastName else lastName

    def titleIsTruncated(self):
        """
        Report whether the title has been flagged as truncated.
        """
        return self.titleTruncated
class Email(alert.Alert, html.parser.HTMLParser):
    """
    All the information in a Google Scholar Email alert.

    Parse HTML email body from Google Scholar.  The body may be reporting
    more than one paper.

    Parsing is a small state machine driven by these flags:
      inSearch     - inside the text naming the search; starts at data
                     beginning with "Scholar Alert: ", ends at </b>.
      inTitleLink  - an <h3> has opened and its <a> has not been seen yet.
      inTitleText  - inside the title <a>; text is appended to the title.
      inAuthorList - between </h3> and the next </div>; text is the author
                     list, optionally followed by "- " and the source.
    """

    searchStartRe = re.compile(r'Scholar Alert: ')

    def __init__(self, email):
        """
        Decode and parse the body of one alert email.

        email: object whose getBodyText() returns the Quoted Printable
               encoded HTML body; the decoded bytes are read as UTF-8.
        """
        # NOTE(review): alert.Alert.__init__ is not invoked here -- confirm
        # the base class needs no set-up of its own.
        html.parser.HTMLParser.__init__(self)

        self.papers = []
        self.search = "Google "
        self.currentPaper = None
        self.inSearch = False
        self.inTitleLink = False
        self.inTitleText = False
        self.inAuthorList = False

        # Google Scholar email body content is Quoted Printable encoded.
        # Decode it, then process the HTML body text.
        emailBodyText = quopri.decodestring(email.getBodyText())
        self.feed(emailBodyText.decode('utf-8'))

    def handle_data(self, data):
        """
        Route a run of text to the search label, the current paper's title,
        or the current paper's author/source fields, depending on state.
        """
        data = data.strip()

        if Email.searchStartRe.match(data):
            # First piece of the search description.
            self.search += data
            self.inSearch = True
        elif self.inSearch:
            # Further pieces of the search description (until </b>).
            self.search += " " + data
        elif self.inTitleText and data:
            # Sometimes we lose the space between two parts of a title, so
            # re-insert one when joining pieces.
            title = self.currentPaper.title
            if title and title[-1] != " ":
                self.currentPaper.title += " "
            self.currentPaper.title += data
            # A trailing " …" is deliberately left on the title; this parser
            # never clips it and never sets titleTruncated.
        elif self.inAuthorList and data:
            # Author list may also have the source at the end, after "- ".
            # The source is only recorded when there is exactly one "- ".
            parts = data.split("- ")
            self.currentPaper.authors += parts[0].strip()
            if len(parts) == 2:
                self.currentPaper.source = parts[1]

    def handle_starttag(self, tag, attrs):
        """
        Start a new paper at <h3>; pull the paper URL out of the first <a>
        that follows it.

        attrs: list of (name, value) pairs as supplied by HTMLParser; value
               is None for an attribute written without a value.
        """
        if tag == "h3":
            # Link to paper is shown in h3.
            self.inTitleLink = True
            self.currentPaper = Paper()
            self.papers.append(self.currentPaper)
            self.currentPaper.search = self.search
        elif tag == "a" and self.inTitleLink:
            # Find href by name rather than assuming it is the first
            # attribute: an <a> with no attributes, a valueless href, or a
            # leading class/style attribute must not crash or be mistaken
            # for the link.
            fullUrl = next(
                (value for name, value in attrs if name == "href" and value),
                "")
            # The real paper URL is carried URL-encoded in a "q=" or "url="
            # query argument of the link; whichever comes first wins.
            queryString = fullUrl[fullUrl.find("?") + 1:]
            for urlArg in queryString.split("&"):
                if urlArg.startswith("q="):
                    self.currentPaper.url = urllib.parse.unquote(urlArg[2:])
                    break
                if urlArg.startswith("url="):
                    self.currentPaper.url = urllib.parse.unquote(urlArg[4:])
                    break
            self.inTitleLink = False
            self.inTitleText = True

    def handle_endtag(self, tag):
        """
        Advance the state machine on closing tags.
        """
        if tag == "b" and self.inSearch:
            self.inSearch = False
        elif tag == "a" and self.inTitleText:
            self.inTitleText = False
        elif tag == "h3":
            # Whatever follows the heading is the author list.
            self.inAuthorList = True
        elif tag == "div" and self.inAuthorList:
            self.inAuthorList = False

    def handle_startendtag(self, tag, attrs):
        """
        Ignore XHTML-style empty tags such as <br/> and <img ... />.

        Overriding this stops such tags from being forwarded to
        handle_starttag() / handle_endtag().
        """
        return None

    def getPapers(self):
        """
        Return the list of Paper objects found in this alert, in the order
        they appeared in the body.
        """
        return self.papers

    def getSearch(self):
        """
        Return text identifying which Google Scholar search this alert is
        for: "Google " followed by the "Scholar Alert: ..." text found in
        the body.
        """
        return self.search