-
Notifications
You must be signed in to change notification settings - Fork 0
/
BaiduSpider3.py
168 lines (138 loc) · 4.93 KB
/
BaiduSpider3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# -*- coding: utf-8 -*-
"""
@author:随时静听
@file: BaiduSpider3.py
@time: 2018/09/21
"""
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
# Baidu search entry URL.
url = "https://www.baidu.com/"
# Search keyword.
keyword = u"inurl:?.asp学校"
# Number of result pages to crawl.
pages = 10
def getUrls(driver):
    """Collect the landing URLs of every result link on the current page.

    Each result link is clicked, the URL of every window opened by those
    clicks is recorded, the window is closed, and finally the driver is
    switched back to the search-results window.

    Args:
        driver: Selenium WebDriver currently showing a search-results page.

    Returns:
        A ``(urls, driver)`` tuple: the list of URLs read from the opened
        windows, and the same driver focused on the search-results window.
    """
    # Remember the search-results window: it must stay open and is restored
    # before returning.
    search_window = driver.current_window_handle

    # The result links are redirect addresses; click each one so the real
    # target gets loaded.
    for link in driver.find_elements_by_xpath("//div[@id='content_left']//div/h3/a"):
        link.click()

    # Every window except the search-results window was opened by the clicks.
    opened_windows = [w for w in driver.window_handles if w != search_window]

    urls = []
    for window in opened_windows:
        # A window has to be focused before it can be inspected or closed.
        driver.switch_to_window(window)
        landed_url = driver.current_url  # query the driver once per window
        urls.append(landed_url)
        print("\t[-] " + landed_url)
        # Close immediately so opened windows do not pile up.
        driver.close()
    print("[*] The number of pages parsed to URL is :" + str(len(opened_windows)))

    # The focused window was just closed; go back to the search results.
    driver.switch_to_window(search_window)
    return urls, driver
def doSearch(url, keyword, driver):
    """Run a search and return the driver in its post-search state.

    Loads ``url``, types ``keyword`` into the element with id ``kw``,
    submits the element with id ``su`` and then pauses for three seconds.

    Args:
        url: Address of the search page to load.
        keyword: Text typed into the search box.
        driver: Selenium WebDriver used to perform the search.

    Returns:
        The same ``driver`` that was passed in.
    """
    driver.get(url)
    search_box = driver.find_element_by_id('kw')
    search_box.send_keys(keyword)
    submit_button = driver.find_element_by_id('su')
    submit_button.submit()
    # Fixed pause after submitting, before the caller touches the page.
    time.sleep(3)
    return driver
def toNextpage(driver):
    """Click the last link inside the ``page`` element, if there is one.

    Args:
        driver: Selenium WebDriver showing a search-results page.

    Returns:
        The same ``driver`` that was passed in.
    """
    pager_links = driver.find_elements_by_xpath("//div[@id='page']//a")
    if not pager_links:
        # No pager links on this page: nothing to click.
        return driver
    # NOTE(review): assumes the last pager link is the "next page" link;
    # confirm how the pager looks on the final results page.
    pager_links[-1].click()
    return driver
def getargs():
    """Parse the command line into ``(keywords, nums, filename)``.

    Accepted forms (``sys.argv[1:]``)::

        keywords
        keywords nums
        keywords nums filename

    ``nums`` defaults to 6 and ``filename`` to ``"result.txt"``.

    Returns:
        A ``(keywords, nums, filename)`` tuple where ``nums`` is an int.

    Raises:
        SystemExit: with status 0 after printing the usage text when no
            keyword is given; with status 1 after printing an error and the
            usage text when ``nums`` is not an integer or when more than
            three arguments are supplied.
    """
    import sys

    usage='''
Usage: BaiduSpider keywords [nums] [filename]
keywords: your Search keywords
nums: default value is 6,seach max pages
filename: result saved file name,default value is result.txt
'''
    args = sys.argv[1:]
    if not args:
        print(usage)
        sys.exit()

    # Extra arguments used to fall through and return None, which crashed the
    # caller while unpacking; treat them as a parameter error instead.
    bad_arguments = len(args) > 3

    nums = 6
    if not bad_arguments and len(args) >= 2:
        try:
            nums = int(args[1])
        except ValueError:
            bad_arguments = True

    if bad_arguments:
        print("[!] Parameter error !")
        print(usage)
        sys.exit(1)

    filename = args[2] if len(args) == 3 else "result.txt"
    return (args[0], nums, filename)
def saveData(urls, filename):
    """Append every URL in ``urls`` to ``filename``, one per line.

    Nothing is written, and the file is not opened, when ``urls`` is empty.

    Args:
        urls: Sequence of URL strings to store.
        filename: Path of the file the URLs are appended to.
    """
    if not urls:
        return
    with open(filename, 'a+') as out:
        out.writelines(link + '\n' for link in urls)
def run():
    """Script entry point: search, crawl the result pages and save the URLs.

    Reads the keyword, page count and output file name from the command line
    via ``getargs()``, starts a PhantomJS browser, runs the search with
    ``doSearch()``, and for each requested page collects the result URLs
    with ``getUrls()`` and appends them to the output file with
    ``saveData()``.  Any exception raised while crawling is printed rather
    than propagated, and the browser is shut down in every case.
    """
    keyword, pages, filename = getargs()
    print("[*] Keywords:" + keyword)
    print("[*] nums:" + str(pages))
    print("[*] save file name:" + filename)
    print("\n")
    print("--" * 30)

    urls = []
    # Stays None when the browser fails to start, so the cleanup below does
    # not hit an unbound name and hide the real error.
    driver = None
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 "
            "SE 2.X MetaSr 1.0"
        )
        # Do not load images.
        dcap["phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(desired_capabilities=dcap)
        driver.set_page_load_timeout(20)
        driver.implicitly_wait(6)

        driver = doSearch(url, keyword, driver)
        # Crawl exactly `pages` result pages.
        for i in range(pages):
            print("[*] Page data loading: " + str(i + 1))
            url_lst, driver = getUrls(driver)
            urls.extend(url_lst)
            # Persist each page as soon as it is collected so a later
            # failure does not lose what was already gathered.
            saveData(url_lst, filename)
            if i + 1 < pages:
                # Only move on (and wait) when another page will be crawled.
                driver = toNextpage(driver)
                time.sleep(5)
        print("[*] The urls total num is:" + str(len(urls)))
        print("[*] The spider urls save as file:" + filename)
    except Exception as e:
        print("[!] There seems to be a mistake! The error message is as follows: ")
        print("--" * 10 + "Error Message Info" + "--" * 10)
        print(e)
        print("--" * 10 + "Error Message Info END" + "--" * 10)
    finally:
        # Shut the browser down.
        if driver is not None:
            driver.quit()
# Start the crawler only when this file is executed as a script.
if __name__ == "__main__":
    run()