-
Notifications
You must be signed in to change notification settings - Fork 0
/
basespider.py
62 lines (49 loc) · 1.87 KB
/
basespider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import requests
from lxml import etree
from utils import get_request_header
class BaseSpider(object):
    """Generic XPath-driven scraper for tabular proxy-list pages.

    A spider is configured with three pieces of data, either as class
    attributes on a subclass or as constructor arguments:

    * ``urls`` -- the pages to download.
    * ``group_xpath`` -- XPath selecting one node per record (e.g. a ``<tr>``).
    * ``son_xpath`` -- mapping with the keys ``"ip"``, ``"port"`` and
      ``"region"``; each value is an XPath evaluated relative to a record node.
    """

    # Class-level defaults; subclasses may override them. They are shared
    # between instances, so they must be rebound, never mutated in place.
    urls = []
    group_xpath = ""
    son_xpath = {}

    def __init__(self, group_xpath='', son_xpath=None, urls=None):
        """Override the class-level configuration for this instance.

        Any argument that is empty or omitted leaves the corresponding
        class attribute in effect.
        """
        # ``None`` defaults avoid the shared-mutable-default pitfall; the
        # truthiness checks treat ``None`` exactly like the old ``{}`` / ``[]``.
        if group_xpath:
            self.group_xpath = group_xpath
        if son_xpath:
            self.son_xpath = son_xpath
        if urls:
            self.urls = urls

    def handle_no_data(self, li):
        """Return the first item of ``li``, or ``''`` when ``li`` is empty."""
        return li[0] if li else ''

    def get_page(self, url):
        """Download ``url`` and return its body decoded to ``str``.

        The body is decoded as UTF-8 first and as GBK if that fails.
        Network errors raised by ``requests`` propagate to the caller, as
        does ``UnicodeDecodeError`` when neither codec fits.
        """
        # NOTE(review): no timeout is passed, so a stalled server blocks
        # this call indefinitely -- consider adding one.
        # get_request_header() comes from utils; presumably it returns the
        # HTTP headers dict to send -- verify against utils.
        raw = requests.get(url, headers=get_request_header()).content
        try:
            return raw.decode()
        except UnicodeDecodeError:
            # Same bytes, different codec: no need to hit the server again.
            return raw.decode("GBK")

    def _extract_field(self, tr, key):
        """Return the whitespace-trimmed first match of ``son_xpath[key]`` in ``tr``."""
        return self.handle_no_data(tr.xpath(self.son_xpath[key])).strip()

    def get_son_info(self, trs):
        """Yield ``[ip, port, region]`` for every record node in ``trs``.

        A field whose XPath matches nothing is yielded as ``''``.
        Raises ``KeyError`` if ``son_xpath`` lacks one of the three keys.
        """
        for tr in trs:
            yield [
                self._extract_field(tr, "ip"),
                self._extract_field(tr, "port"),
                self._extract_field(tr, "region"),
            ]

    def parse_page(self, page):
        """Parse ``page`` as HTML and return the nodes matching ``group_xpath``."""
        element = etree.HTML(page)
        return element.xpath(self.group_xpath)

    def get_results(self):
        """Scrape every configured URL in order, yielding ``[ip, port, region]`` rows."""
        for url in self.urls:
            page = self.get_page(url)
            trs = self.parse_page(page)
            yield from self.get_son_info(trs)
if __name__ == "__main__":
    # Smoke test: scrape proxy IPs from the first three listing pages of 66ip.cn.
    field_xpaths = {
        "ip": './td[1]/text()',
        "port": './td[2]/text()',
        "region": './td[3]/text()'
    }
    page_urls = []
    for page_no in range(1, 4):
        page_urls.append(f"http://www.66ip.cn/{page_no}.html")
    spider = BaseSpider(
        urls=page_urls,
        son_xpath=field_xpaths,
        group_xpath='//*[@id="main"]/div/div[1]/table/tr[position()>1]',
    )
    # Print each [ip, port, region] row produced by the spider.
    for row in spider.get_results():
        print(row)