forked from dongweiming/Mtime
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspider.py
165 lines (143 loc) · 5.29 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# coding=utf-8
'''
爬虫
'''
import zlib
import urllib
import urllib2
import cookielib
try:
from cStringIO import StringIO
except:
from StringIO import StringIO
from gzip import GzipFile
from datetime import datetime
from collections import OrderedDict
from utils import get_user_agent
from log import debug
# deflate support
def deflate(data):
    '''Decompress a "deflate" encoded payload.

    The payload is first treated as a raw deflate stream (no zlib header);
    if zlib rejects it, it is decoded again as a zlib-wrapped stream.
    A ``zlib.error`` from the second attempt propagates to the caller.
    '''
    # A negative window size tells zlib not to expect a header or checksum.
    raw_window = -zlib.MAX_WBITS
    try:
        inflated = zlib.decompress(data, raw_window)
    except zlib.error:
        inflated = zlib.decompress(data)
    return inflated
class ContentEncodingProcessor(urllib2.BaseHandler):
    '''urllib2 handler adding default headers, cookies and body decoding.

    On the way out it sets ``Accept-Encoding``, ``User-Agent`` and
    ``Accept-Language`` plus any caller supplied headers, and attaches
    stored cookies when cookie support is enabled.  On the way back it
    records cookies and transparently decompresses gzip/deflate bodies.
    The same hooks are registered for both HTTP and HTTPS.
    '''

    # Stays None unless the instance is created with cookie_support enabled.
    cookiejar = None

    def __init__(self, cookie_support, additional_headers):
        '''
        cookie_support: truthy to keep cookies in a per-handler CookieJar.
        additional_headers: mapping (or iterable of pairs) of extra request
            headers, or None.
        '''
        self.additional_headers = additional_headers
        if cookie_support:
            self.cookiejar = cookielib.CookieJar()

    def http_request(self, req):
        '''Decorate the outgoing request and return it.'''
        # Default headers.
        req.add_header('Accept-Encoding', 'gzip, deflate')
        req.add_header('User-Agent', get_user_agent())
        req.add_header('Accept-Language',
                       'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3')
        if self.additional_headers is not None:
            # Go through add_header() so keys are normalised the same way as
            # the defaults above; a caller supplied 'user-agent' then
            # replaces the default instead of sitting next to it.
            for name, value in dict(self.additional_headers).items():
                req.add_header(name, value)
        if self.cookiejar is not None:
            self.cookiejar.add_cookie_header(req)
        return req

    def http_response(self, req, resp):
        '''Return resp, re-wrapped with a decompressed body when needed.'''
        if self.cookiejar is not None:
            self.cookiejar.extract_cookies(resp, req)
        # Content-coding tokens are case-insensitive, so normalise first.
        encoding = resp.headers.get("content-encoding") or ''
        encoding = encoding.strip().lower()
        # Uncompressed page (e.g. an API returning JSON): hand it back as is.
        if encoding not in ('gzip', 'deflate'):
            return resp
        content = resp.read()
        if encoding == 'gzip':
            body = GzipFile(fileobj=StringIO(content), mode="r")
        else:
            body = StringIO(deflate(content))
        # Rebuild a response object around the decoded stream, keeping the
        # original headers, URL, status code and message.
        decoded = urllib2.addinfourl(body, resp.headers, resp.url, resp.code)
        decoded.msg = resp.msg
        return decoded

    # urllib2 dispatches on "<protocol>_request"/"<protocol>_response", so
    # HTTPS needs its own names to get the same treatment.
    https_request = http_request
    https_response = http_response
class Spider(object):
    '''Base crawler: builds a query string, fetches a URL, stores the body.

    Subclasses override make_query() to supply the request parameters.
    After fetch() the raw response body is available as ``self.content``.
    '''

    def __init__(self, cookie_support=True, additional_headers=None,
                 params=None):
        '''
        cookie_support: passed to ContentEncodingProcessor to enable cookies.
        additional_headers: extra request headers, or None.
        params: query parameters read by make_query() in subclasses;
            defaults to a fresh empty dict for every instance.
        '''
        self.cookie_support = cookie_support
        self.additional_headers = additional_headers
        # A literal {} default would be one dict shared by all instances.
        self.params = {} if params is None else params

    def make_query(self):
        '''Return the base query parameters (none for the plain Spider).'''
        return {}

    def fetch(self, url):
        '''Request url with the encoded query and keep the body in
        self.content.

        Installs the opener globally via urllib2.install_opener().
        '''
        debug('Fetch Url: {} start...'.format(url))
        opener = urllib2.build_opener(
            ContentEncodingProcessor(self.cookie_support,
                                     self.additional_headers),
            urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        params = urllib.urlencode(self.make_query())
        if params:
            # Extend an existing query string instead of adding a second '?'.
            separator = '&' if '?' in url else '?'
            url = '{}{}{}'.format(url, separator, params)
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)
        try:
            self.content = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        debug('Fetch Url: {} done'.format(url))

    @classmethod
    def get_timestamp(cls):
        '''Return the current local time as a digit string.

        The year, month, day, hour, minute and second are concatenated
        without zero padding, followed by the first five characters of the
        microsecond value.
        '''
        now = datetime.now()
        parts = (now.year, now.month, now.day, now.hour, now.minute,
                 now.second, str(now.microsecond)[:5])
        return ''.join(str(part) for part in parts)
class Search(Spider):
    '''Spider used for movie searches.'''

    def make_query(self):
        '''Return the ordered query parameters for a search request.

        An OrderedDict passed as params is returned untouched.  Otherwise
        the five required Ajax_* keys are copied from params in a fixed
        order, a timestamp is added as 't', and Ajax_CallBackArgument0
        through Ajax_CallBackArgument19 are filled in, using 0 for any
        that params does not provide.
        '''
        source = self.params
        if isinstance(source, OrderedDict):
            return source

        query = OrderedDict()
        required = ('Ajax_CallBack', 'Ajax_CallBackType',
                    'Ajax_CallBackMethod', 'Ajax_CrossDomain',
                    'Ajax_RequestUrl')
        for key in required:
            query[key] = source[key]
        query['t'] = self.get_timestamp()
        for index in range(20):
            key = 'Ajax_CallBackArgument' + str(index)
            query[key] = source.get(key, 0)
        return query
class Movie(Spider):
    '''Spider whose query calls the DatabaseService LoadData2 method.'''

    def make_query(self):
        '''Return the ordered query parameters for the request.

        An OrderedDict passed as params is returned untouched.  Otherwise
        the fixed callback settings are combined with Ajax_RequestUrl and
        Ajax_CallBackArgument1 taken from params.
        '''
        source = self.params
        if isinstance(source, OrderedDict):
            return source

        # TODO: optimize; decouple this from beat
        service = 'Mtime.Community.Controls.CommunityPages.DatabaseService'
        return OrderedDict([
            ('Ajax_CallBack', True),
            ('Ajax_CallBackType', service),
            ('Ajax_CallBackMethod', 'LoadData2'),
            ('Ajax_CrossDomain', 1),
            ('Ajax_RequestUrl', source['Ajax_RequestUrl']),
            ('Ajax_CallBackArgument0', 1),
            ('Ajax_CallBackArgument1', source['Ajax_CallBackArgument1']),
        ])
class Comment(Spider):
    '''Spider whose query calls GetMovieReviewAndTweetCountInfo.'''

    def make_query(self):
        '''Return the ordered query parameters for the request.

        An OrderedDict passed as params is returned untouched.  Otherwise
        the fixed callback settings are combined with a fresh timestamp and
        with Ajax_RequestUrl, Ajax_CallBackArgument0 and
        Ajax_CallBackArgument1 taken from params.
        '''
        source = self.params
        if isinstance(source, OrderedDict):
            return source

        return OrderedDict([
            ('Ajax_CallBack', True),
            ('Ajax_CallBackType', 'Mtime.Library.Services'),
            ('Ajax_CallBackMethod', 'GetMovieReviewAndTweetCountInfo'),
            ('Ajax_CrossDomain', 1),
            ('Ajax_RequestUrl', source['Ajax_RequestUrl']),
            ('t', self.get_timestamp()),
            ('Ajax_CallBackArgument0', source['Ajax_CallBackArgument0']),
            ('Ajax_CallBackArgument1', source['Ajax_CallBackArgument1']),
        ])