#! /usr/bin/python3
# vim: set expandtab tabstop=4 shiftwidth=4 :
"""Module with functions wrapping urllib."""
import http.client
import urllib.request
import urllib.error
import urllib.parse
import ssl
import gzip
import inspect
import logging
import json
import os
import time
from bs4 import BeautifulSoup

# Avoid flooding the server with requests
DELAY_BEFORE_REQUEST = 1


def log(string):
    """Dirty logging function."""
    # TODO: use the %(funcName)s LogRecord attribute so that the caller's
    # name does not need to be retrieved manually:
    # https://docs.python.org/3/library/logging.html#logrecord-attributes
    logging.debug(inspect.stack()[1][3] + " " + string)

def urlopen_wrapper(url, referer=None):
    """Wrapper around urllib.request.urlopen (user-agent, etc).

    url is a string
    referer is an optional string
    Returns a file-like object whose read() returns bytes."""
    log('(url : %s)' % url)
    user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 '
                  '(KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 '
                  'Chrome/12.0.742.112 Safari/534.30')
    try:
        time.sleep(DELAY_BEFORE_REQUEST)
        req = urllib.request.Request(
            url,
            headers={'User-Agent': user_agent, 'Accept': '*/*'})
        if referer:
            req.add_header('Referer', referer)
        response = urllib.request.urlopen(req)
        # Transparently decompress gzip-encoded responses
        if response.info().get('Content-Encoding') == 'gzip':
            return gzip.GzipFile(fileobj=response)
        return response
    except (urllib.error.HTTPError,
            http.client.RemoteDisconnected,
            urllib.error.URLError,
            ConnectionResetError,
            ssl.CertificateError) as e:
        print("Exception %s for url %s" % (e, url))
        raise
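

# Example (illustrative only; the URLs below are placeholders): fetching a
# resource with a Referer header set, then reading the raw bytes.
#
#     resp = urlopen_wrapper('https://example.com/image.png',
#                            referer='https://example.com/')
#     data = resp.read()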

def get_content(url):
    """Get content at url.

    url is a string
    Returns bytes."""
    log('(url : %s)' % url)
    try:
        return urlopen_wrapper(url).read()
    except http.client.IncompleteRead as e:
        # Keep whatever was received before the connection dropped
        print("%s for %s" % (e, url))
        return e.partial
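

# Example (illustrative only; the URL is a placeholder): downloading a page
# and decoding the returned bytes.
#
#     html = get_content('https://example.com').decode('utf-8')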

class JsonCache(object):
    """Dict-like cache persisted as a JSON file."""

    def __init__(self, filepath):
        self.filepath = filepath
        self.data = self.get_data_from(self.filepath)

    def get(self, key, default=None):
        return self.data.get(key, default)

    def __contains__(self, item):
        return item in self.data

    def __getitem__(self, key):
        return self.data[key]

    def __setitem__(self, key, value):
        # Skip the save to disk when nothing actually changes
        if key in self.data and self.data[key] == value:
            return
        self.data[key] = value
        self.on_update()

    def on_update(self):
        self.save_data_in(self.data, self.filepath)

    @staticmethod
    def get_data_from(json_filepath):
        try:
            with open(json_filepath) as f:
                return json.load(f)
        except IOError:
            # Missing or unreadable file: start with an empty cache
            return dict()

    @staticmethod
    def save_data_in(data, json_filepath):
        with open(json_filepath, 'w+') as f:
            json.dump(data, f, indent=4, sort_keys=True)

    @staticmethod
    def save_data_in_tmp(data):
        """Unrelated function. Can be useful for debugging purposes."""
        import tempfile
        with tempfile.NamedTemporaryFile(
                mode='w+', delete=False, suffix='.json') as f:
            print(f.name)
            json.dump(data, f, indent=4, sort_keys=True)
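

# Example (illustrative; the file path is a placeholder): JsonCache behaves
# like a dict that writes itself back to disk on every change.
#
#     cache = JsonCache('/tmp/cache.json')
#     if 'answer' not in cache:
#         cache['answer'] = 42  # triggers on_update() and saves the file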

class UrlCache(object):
    """On-disk cache mapping urls to the content they returned."""

    def __init__(self, folder):
        self.folder = folder
        os.makedirs(self.folder, exist_ok=True)
        self.jsonCache = JsonCache(os.path.join(self.folder, "urlCache.json"))

    def get_content(self, url):
        rel_name = self.jsonCache.get(url, None)
        if rel_name is not None:
            with open(os.path.join(self.folder, rel_name), "rb") as f:
                return f.read()
        content = get_content(url)
        # hash() is not stable across interpreter runs, but that is fine
        # here: the generated file name is remembered in the JSON index.
        rel_name = str(hash(url)) + "_" + "".join(
            c if c.isalnum() else "-" for c in url)
        with open(os.path.join(self.folder, rel_name), "wb") as f:
            f.write(content)
        self.jsonCache[url] = rel_name
        return content

    def get_soup(self, url):
        content = self.get_content(url)
        return BeautifulSoup(content, "html.parser")

    def get_json(self, url):
        content = self.get_content(url)
        return json.loads(content.decode())
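

if __name__ == '__main__':
    # Minimal smoke test, not part of the original API: the cache folder and
    # the URLs are placeholders, and running this requires network access.
    cache = UrlCache('/tmp/urlCache')
    soup = cache.get_soup('https://example.com')
    print(soup.title)
    print(cache.get_json('https://httpbin.org/json'))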