-
Notifications
You must be signed in to change notification settings - Fork 4
/
main.py
executable file
·127 lines (95 loc) · 2.84 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python
#
# IANAPP (i am not a python programmer)
#
import cgi
import os
from sgmllib import SGMLParser
from google.appengine.api import urlfetch
from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app
from google.appengine.ext import db
from google.appengine.ext.webapp import template
import pprint
class MainPage(webapp.RequestHandler):
    """Handler for the HTML front page.

    Renders the ``index.html`` template that sits next to this file. When the
    request carries a ``url`` parameter, the template additionally receives
    the result of the short-URL lookup for that address.
    """

    def get(self):
        """Render the page, including a lookup result when ``url`` is given.

        Template variables that may be set:

        * ``url``   -- the address that was submitted
        * ``link``  -- the first short URL found, or ``url`` itself when the
          page advertises none
        * ``error`` -- the exception raised while looking the page up
        """
        context = {}
        template_path = os.path.join(os.path.dirname(__file__), 'index.html')

        requested = self.request.get('url')
        if requested:
            context['url'] = requested
            try:
                found = RevCanonical().revcanonical(requested)
                # Fall back to the submitted address when nothing was found.
                context['link'] = found[0] if found else requested
            except Exception as e:
                # Hand the failure to the template instead of failing the
                # whole request.
                context['error'] = e

        self.response.out.write(template.render(template_path, context))

    def post(self):
        """Treat a POST exactly like a GET."""
        self.get()
class ApiPage(webapp.RequestHandler):
    """Handler for the ``/api`` endpoint.

    Takes a ``url`` request parameter and answers with the short URL the
    page at that address advertises, or with the given URL unchanged when no
    short URL is found.
    """

    def get(self):
        """Write the short URL for the ``url`` parameter to the response.

        Without a ``url`` parameter a short usage message is written instead.
        When the lookup raises, the response status is set to 500 and the
        exception text becomes the body.
        """
        url = self.request.get('url')
        if not url:
            self.response.out.write("Takes argument <code>url</code> returns reverse canonicalized URL, if found. Otherwise returns the passed URL.")
            return

        try:
            links = RevCanonical().revcanonical(url)
        except Exception as e:
            self.error(500)
            self._write_plain(e)
            return

        # First short URL wins; otherwise echo the caller's URL back.
        self._write_plain(links[0] if links else url)

    def post(self):
        """POST requests are accepted but produce an empty response."""
        pass

    def _write_plain(self, body):
        """Write ``body`` to the response, labelled as plain text.

        The body is either caller-supplied (the URL) or derived from it (an
        error message), so it must not be interpreted as HTML by a browser.
        NOTE(review): webapp is believed to default to ``text/html``; confirm
        against the framework version in use.
        """
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.out.write(body)
class RevCanonical:
    """Find the short URLs that a web page advertises through <link> tags.

    A link counts as a short URL when its ``rel`` attribute contains both
    ``alternate`` and ``short``, or contains one of ``short_url``,
    ``shorter-alternative`` or ``shortlink``, or when its ``rev`` attribute
    contains ``canonical``. All checks are substring matches.
    """

    # ``rel`` values that mark a short link on their own.
    _SHORT_RELS = ('short_url', 'shorter-alternative', 'shortlink')

    def revcanonical(self, url):
        """Fetch ``url`` and return the short URLs its page advertises.

        Args:
            url: address of the page to inspect.

        Returns:
            A list of href strings, one per matching link, in document
            order. If ``url`` has a ``#fragment`` it is appended to every
            returned href. The list is empty when nothing matches.

        Raises:
            Whatever ``urlfetch.fetch`` or the parser raises; nothing is
            caught here.
        """
        resp = urlfetch.fetch(url)

        # Keep the piece after the first '#' so that the short URL points at
        # the same place inside the page as the original one.
        pieces = url.split('#')
        fragment = '#' + pieces[1] if len(pieces) > 1 else ''

        parser = LinkParser()
        parser.feed(resp.content)

        # Each link is taken at most once, even when it qualifies through
        # several attributes (e.g. both rel="shortlink" and rev="canonical").
        shorts = [attrs for attrs in parser.links
                  if self._is_short_link(attrs)]
        return self.hrefs(shorts, fragment)

    def hrefs(self, links, fragment=''):
        """Return the ``href`` values of ``links`` with ``fragment`` appended.

        Args:
            links: iterable of attribute lists, each a sequence of
                ``(name, value)`` pairs.
            fragment: string appended to every href (default: nothing).

        Returns:
            A list of strings; links without an ``href`` contribute nothing.
        """
        found = []
        for attrs in links:
            for attr in attrs:
                if attr[0] == 'href':
                    found.append(attr[1] + fragment)
        return found

    @staticmethod
    def _is_short_link(attrs):
        """Tell whether a link's attribute list marks it as a short URL."""
        for attr in attrs:
            name, value = attr[0], attr[1]
            if name == 'rel':
                if 'alternate' in value and 'short' in value:
                    return True
                for marker in RevCanonical._SHORT_RELS:
                    if marker in value:
                        return True
            elif name == 'rev' and 'canonical' in value:
                return True
        return False
class LinkParser(SGMLParser):
    """SGML parser that records the attributes of ``<link>`` tags.

    After feeding a document, ``links`` holds one attribute list (a sequence
    of ``(name, value)`` pairs) for every ``<link>`` that has an ``href``.
    Tag processing is switched off once the document head is over.
    """

    def reset(self):
        """Reset the parser state and start with an empty ``links`` list."""
        SGMLParser.reset(self)
        self.links = []

    def do_link(self, attrs):
        """Remember the attributes of a ``<link>`` tag that has an ``href``."""
        for attr in attrs:
            if attr[0] == 'href':
                self.links.append(attrs)
                break

    def end_head(self, attrs=None):
        """Stop interpreting tags for the rest of the document.

        ``attrs`` is optional because this one function serves two roles:
        sgmllib calls ``start_*`` handlers with the tag's attribute list but
        ``end_*`` handlers with no arguments at all.

        NOTE(review): sgmllib appears to dispatch ``end_head`` only for tags
        that were opened through a ``start_head`` handler, which this class
        does not define, so in practice the cut-off seems to happen at
        ``<body>`` -- confirm.
        """
        self.setnomoretags()

    # An opening <body> ends the head just as well as </head> does.
    start_body = end_head
# Routing table: '/' is handled by MainPage, '/api' by ApiPage.
application = webapp.WSGIApplication(
    [
        ('/', MainPage),
        ('/api', ApiPage),
    ],
    debug=True
)
def main():
    """Run the module-level WSGI ``application``."""
    run_wsgi_app(application)


# Only start serving when executed as a script, not when imported.
if __name__ == "__main__":
    main()