-
Notifications
You must be signed in to change notification settings - Fork 10
/
sample_and_camera.py
197 lines (171 loc) · 8.5 KB
/
sample_and_camera.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# -*- coding: UTF-8 -*-
import random
import re
import sys
from urllib2 import Request, urlopen
import xlwt
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf-8')
# Advanced-search result pages on detail.zol.com.cn, one per release year.
# The URLs differ only in the "s<code>" period segment of the path, so they
# are generated from a single template.  2015/2014 use a different page
# layout and are intentionally disabled.
urls = {
    year: u"http://detail.zol.com.cn/cell_phone_advSearch/subcate57_1_s%s_1_1_0_1.html" % code
    for year, code in (
        ("2019", "8010"),
        ("2018", "7500"),
        ("2017", "7235"),
        ("2016", "6472"),
        # ("2015", "6132"),  # different page format, skipped
        # ("2014", "5359"),
    )
}
def zol_spider(year):
wb_name = '%s.xls' % year
wb = xlwt.Workbook(encoding="utf-8")
sheet = wb.add_sheet("zol", cell_overwrite_ok=True)
title_index = { # 索引参数的列
'机型': 0,
'价格': 1,
'4G网络': 2,
'屏幕': 3,
'CPU': 4,
'主频': 5,
'电池': 6,
'操作系统': 7,
'RAM': 8,
'ROM': 9,
'主摄像头': 10,
# 以上是概要列表页的基础信息,不要动。
'摄像头总数': 11, # 左边键可以随便起,好记即可,右边的数字11对应 上面的 摄像头总数。
'前置摄像头': 12,
'传感器': 13,
'闪光灯': 14,
'光圈': 15,
'焦距': 16,
'广角': 17,
'视频拍摄': 18,
'摄像头认证': 19,
'摄像头特色': 20,
'拍照功能': 21,
'其他摄像头参数': 22,
}
if len(title_index) != len(set(title_index)):
raise ValueError('titles has duplicates.')
for __column in title_index:
sheet.write(0, title_index[__column], __column)
wb.save(wb_name)
rows = 1 # excel 行数索引
detail_domain = "http://detail.zol.com.cn"
head = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
url = urls[year]
req = Request(url, headers=head)
response = urlopen(req)
html = response.read().decode('gbk')
# print html
soup = BeautifulSoup(html, 'html.parser')
total_page_area = soup.find('div', class_="page_total") # 获取页面区域的信息
__pages = re.findall(u"/(\d*) 页", total_page_area.text) # 获取总页码
if len(__pages) == 1:
total_page = int(__pages[0])
print "Total pages: %s" % total_page
else:
print 'get total pages failed.total %s' % len(__pages)
sys.exit(-1)
# 生成所有待爬的网页
url_templet = url.replace('1.html', '')
unknown_list = []
for each_page in range(total_page): # 遍历,开爬
print "page: ", each_page + 1
per_url = "%s%s%s" % (url_templet, each_page + 1, ".html")
req = Request(per_url, headers=head)
response = urlopen(req)
html = response.read().decode('gbk')
soup = BeautifulSoup(html, 'html.parser')
result_frame = soup.find("ul", class_="result_list") # 包含搜索信息的那个框架
phones = result_frame.find_all("li") # 匹配出单个手机的信息
for phone_content in phones:
try: # 获取价格
phone_name = phone_content.find("dl", class_="pro_detail").find("a").text
phone_price = phone_content.find("div", class_="date_price").find("b", class_="price-type").text
sheet.write(rows, title_index['机型'], phone_name)
sheet.write(rows, title_index['价格'], phone_price)
except:
continue
details = phone_content.find_all("li")
for i in details:
if u'4G网络' in str(i):
sheet.write(rows, title_index['4G网络'], i["title"])
elif u'主屏尺寸' in str(i):
sheet.write(rows, title_index['屏幕'], i["title"])
elif u'CPU型号' in str(i):
sheet.write(rows, title_index['CPU'], i["title"])
elif u'CPU频率' in str(i):
sheet.write(rows, title_index['主频'], i["title"])
elif u'电池容量' in str(i):
sheet.write(rows, title_index['电池'], i["title"])
elif u'出厂系统' in str(i):
sheet.write(rows, title_index['操作系统'], i["title"])
elif u'RAM容量' in str(i):
sheet.write(rows, title_index['RAM'], i["title"])
elif u'ROM容量' in str(i):
sheet.write(rows, title_index['ROM'], i["title"])
elif u'后置摄像' in str(i):
sheet.write(rows, title_index['主摄像头'], i["title"])
detail_url = phone_content.find("a", target="_blank")["href"]
phone_detail_url = detail_domain + detail_url
req = Request(phone_detail_url, headers=head)
response = urlopen(req)
html = response.read().decode('gbk')
soup = BeautifulSoup(html, 'html.parser')
# 以下是获取摄像头表格的代码↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
tds = soup.find('td', class_="hd", text=u'摄像头') # 表格标题
try:
camera_area = tds.parent.parent # 摄像头总表格
except:
print "can not get camera info: ", phone_detail_url
rows += 1
continue
for tr in camera_area.find_all('tr'):
try:
if tr.th.text == u'摄像头总数':
sheet.write(rows, title_index['摄像头总数'], tr.td.span.contents[0])
elif tr.th.text == u'前置摄像头':
sheet.write(rows, title_index['前置摄像头'], tr.td.span.contents[0])
elif tr.th.text in [u'传感器类型', u'传感器型号']:
sheet.write(rows, title_index['传感器'], tr.td.span.contents[0])
elif tr.th.text == u'闪光灯':
sheet.write(rows, title_index['闪光灯'], tr.td.span.contents[0])
elif tr.th.text == u'焦距/范围':
sheet.write(rows, title_index['焦距'], tr.td.span.contents[0])
elif tr.th.text in [u'光圈', u'\n光圈\n']:
sheet.write(rows, title_index['光圈'], tr.td.span.contents[0])
elif tr.th.text in [u'广角']:
sheet.write(rows, title_index['广角'], tr.td.span.contents[0])
elif tr.th.text == u'视频拍摄':
sheet.write(rows, title_index['视频拍摄'], tr.td.span.contents[0])
elif tr.th.text == u'拍照功能':
sheet.write(rows, title_index['拍照功能'], tr.td.span.text)
elif tr.th.text == u'摄像头认证':
sheet.write(rows, title_index['摄像头认证'], tr.td.span.text)
elif tr.th.text == u'摄像头特色':
sheet.write(rows, title_index['摄像头特色'], tr.td.span.text)
elif tr.th.text == u'其他摄像头参数':
sheet.write(rows, title_index['其他摄像头参数'], tr.td.span.contents[0])
elif tr.th.text == u'后置摄像头':
pass
else:
if tr.th.text not in unknown_list:
print 'new parm: ', tr.th.text, phone_detail_url
unknown_list.append(tr.th.text)
except:
pass # 大表格外面的标题为none,会报错
# 获取摄像头的代码结束↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑
wb.save(wb_name)
rows += 1
sleep_time = random.randint(1, 3) # 定义一个随机睡眠时间,防止被识别为爬虫,可能有点作用。
# time.sleep(sleep_time)
if __name__ == "__main__":
    # No argument: default to the most recent supported year.
    if len(sys.argv) <= 1:
        zol_spider("2019")
    elif sys.argv[1] in urls:  # membership test directly on the dict, not .keys()
        zol_spider(sys.argv[1])
    else:
        print('wrong argument, only support [2016-2019]')