-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgoogle_spider.py
83 lines (58 loc) · 2.37 KB
/
google_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import urllib3
import time
# Silence urllib3 warnings; search() below issues its requests with
# verify=False, which would otherwise emit a warning on every call.
urllib3.disable_warnings()
# Years to query, one search per year: 2009 through 2020 inclusive
# (the upper bound of range() is exclusive).
years= range(2009,2021) # the years you want to search
# NOTE(review): presumably meant to make requests retry failed connections
# up to 5 times. Confirm this assignment actually takes effect -- the value
# may already have been bound as a default argument when requests was imported.
requests.adapters.DEFAULT_RETRIES = 5
def search(year, ename, cname, timeout=30):
    """Run a Google News search for two names restricted to one calendar year.

    The query is ``"<ename> OR <cname> after:<year>-01-01 before:<year>-12-31"``.

    Args:
        year: Year used to build the ``after:``/``before:`` date filters.
        ename: First search term (must be a ``str``).
        cname: Second search term (must be a ``str``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The ``requests.Response`` for the search page.

    Raises:
        TypeError: If ``ename`` or ``cname`` is not a string (e.g. a NaN cell).
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.78'}
    # Plain concatenation is deliberate: a non-string name raises TypeError
    # instead of silently searching for the text "nan".
    query = (ename + " OR " + cname
             + " after:" + str(year) + "-01-01"
             + " before:" + str(year) + "-12-31")
    # tbm is the parameter that controls the search vertical; 'nws' is news.
    # NOTE(review): verify=False disables TLS certificate verification.
    r = requests.get(
        "https://www.google.com/search",
        params={'tbm': 'nws', 'q': query},
        headers=headers,
        verify=False,
        timeout=timeout,
    )
    return r
def loc_num(r):
    """Extract the result count from a search results page.

    Looks up the element with id ``result-stats``, strips thousands-separator
    commas from its text and takes the first run of digits.

    Args:
        r: Response-like object whose ``text`` attribute holds the page HTML.

    Returns:
        The count as a string of digits when one is found; the integer ``0``
        when the page has no ``result-stats`` element or it contains no
        digits. Callers are expected to pass the value through ``int()``.
    """
    soup = BeautifulSoup(r.text, 'html.parser')
    stats = soup.find(id="result-stats")
    if stats is None:
        return 0
    text = stats.get_text().replace(',', '')
    # Match the first number as-is. A pattern that requires a leading 1-9
    # would skip a count of "0" and pick up digits from the elapsed-time
    # figure that follows it instead.
    match = re.search(r'\d+', text)
    if match is None:
        return 0
    return match.group()
def _collect_counts(df, only_missing=False, pause=False):
    """Fill one column per year in *df* with search result counts, in place.

    For every row, the two name columns are read and ``search``/``loc_num``
    are called once per entry in ``years``; the integer count is stored at
    ``df.loc[index, year]``. A failed lookup is reported and leaves the cell
    unset so a later pass can retry it.

    Args:
        df: DataFrame holding the columns '①去后缀' and '②保留后缀'.
        only_missing: When true, skip cells that already hold a value and
            query only those that are NaN or whose year column is absent.
        pause: When true, sleep ``index % 10`` seconds after each row to
            space out the requests.
    """
    for index, row in df.iterrows():
        cname = row['①去后缀']
        ename = row['②保留后缀']
        for year in years:
            # A year column may not exist yet if every earlier lookup for
            # that year failed; treat that the same as a missing value.
            if (only_missing and year in df.columns
                    and not pd.isnull(df.loc[index, year])):
                continue
            try:
                response = search(year, ename, cname)
                result_stats = loc_num(response)  # get result by bs and re
                df.loc[index, year] = int(result_stats)
                print(cname + ' ' + ename + ' ' + str(year) + ' ' + str(result_stats))
            except Exception as exc:
                # Best effort: keep crawling, but say what failed. Catching
                # Exception (not everything) keeps Ctrl-C working.
                print(f"error: {year}: {exc!r}")
        if pause:
            time.sleep(index % 10)  # sleep for some time


if __name__ == "__main__":
    df = pd.read_excel('result.xlsx')  # open file as a DataFrame
    # First pass: query every row/year, pausing between rows.
    _collect_counts(df, pause=True)
    df.to_excel('result.xlsx')
    # Second pass: retry only the cells the first pass could not fill.
    _collect_counts(df, only_missing=True)
    df.to_excel('result.xlsx')