# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""
-------------------------------------------------
File Name: GetFreeProxy.py
Description : 抓取免费代理
Author : JHao
date: 2016/11/25
-------------------------------------------------
Change Activity:
2016/11/25:
-------------------------------------------------
"""
import re
import requests
try:
    # Py3: reload lives in importlib. It is never actually called on Py3;
    # the import only exists so the Py2 branch below doesn't look like an error.
    from importlib import reload
except:
    import sys  # Py2: reload() is a builtin; re-set default encoding to UTF-8
    reload(sys)
    sys.setdefaultencoding('utf-8')
from Util.utilFunction import robustCrawl, getHtmlTree
from Util.WebRequest import WebRequest
# For debugging: silence urllib3's InsecureRequestWarning on unverified HTTPS.
requests.packages.urllib3.disable_warnings()
class GetFreeProxy(object):
    """
    Free-proxy fetcher.

    Each ``freeProxy*`` static method crawls one public proxy-list site and
    yields proxies as ``"ip:port"`` strings. All methods are best-effort:
    malformed rows are skipped rather than aborting the whole crawl.
    """

    def __init__(self):
        pass

    @staticmethod
    def freeProxyFirst(page=10):
        """
        Crawl data5u (wuyou proxy) http://www.data5u.com/

        :param page: page count (currently unused; a fixed URL list is crawled)
        :return: generator of "ip:port" strings
        """
        url_list = ['http://www.data5u.com/',
                    'http://www.data5u.com/free/',
                    'http://www.data5u.com/free/gngn/index.shtml',
                    'http://www.data5u.com/free/gnpt/index.shtml']
        for url in url_list:
            html_tree = getHtmlTree(url)
            ul_list = html_tree.xpath('//ul[@class="l2"]')
            for ul in ul_list:
                try:
                    # First two <li> texts are ip and port.
                    yield ':'.join(ul.xpath('.//li/text()')[0:2])
                except Exception:
                    # Best-effort: skip malformed entries.
                    pass

    @staticmethod
    def freeProxySecond(proxy_number=100):
        """
        Crawl proxy66 http://www.66ip.cn/

        :param proxy_number: number of proxies to request from the site
        :return: generator of "ip:port" strings
        """
        url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format(
            proxy_number)
        request = WebRequest()
        # .text is the decoded response body (.content would be raw bytes).
        html = request.get(url).text
        for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html):
            yield proxy

    @staticmethod
    def freeProxyThird(days=1):
        """
        Crawl ip181 http://www.ip181.com/

        :param days: unused; kept for backward compatibility
        :return: generator of "ip:port" strings
        """
        url = 'http://www.ip181.com/'
        html_tree = getHtmlTree(url)
        try:
            tr_list = html_tree.xpath('//tr')[1:]
        except Exception:
            return
        for tr in tr_list:
            try:
                # Guard per row so one bad row doesn't abort the rest
                # (previously a single failure skipped all remaining rows).
                yield ':'.join(tr.xpath('./td/text()')[0:2])
            except Exception:
                pass

    @staticmethod
    def freeProxyFourth():
        """
        Crawl xicidaili http://api.xicidaili.com/free2016.txt

        :return: generator of "ip:port" strings
        """
        url_list = ['http://www.xicidaili.com/nn',  # elite (high anonymity)
                    'http://www.xicidaili.com/nt',  # transparent
                    ]
        for each_url in url_list:
            tree = getHtmlTree(each_url)
            proxy_list = tree.xpath('.//table[@id="ip_list"]//tr')
            for proxy in proxy_list:
                try:
                    yield ':'.join(proxy.xpath('./td/text()')[0:2])
                except Exception:
                    pass

    @staticmethod
    def freeProxyFifth():
        """
        Crawl goubanjia http://www.goubanjia.com/free/gngn/index.shtml

        :return: generator of "ip:port" strings
        """
        url = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
        for page in range(1, 10):
            page_url = url.format(page=page)
            tree = getHtmlTree(page_url)
            proxy_list = tree.xpath('//td[@class="ip"]')
            # The site injects hidden decoy digits; filter out nodes styled
            # with display:none and the port element (handled separately).
            xpath_str = """.//*[not(contains(@style, 'display: none'))
                                        and not(contains(@style, 'display:none'))
                                        and not(contains(@class, 'port'))
                                        ]/text()
                                """
            for each_proxy in proxy_list:
                try:
                    # The ':' sits bare under <td>; ip fragments live in
                    # div/span/p children — join them, then fetch the port.
                    ip_addr = ''.join(each_proxy.xpath(xpath_str))
                    port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0]
                    yield '{}:{}'.format(ip_addr, port)
                except Exception:
                    pass

    @staticmethod
    def freeProxySixth():
        """
        Crawl xdaili free proxies
        http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10

        :return: generator of "ip:port" strings
        """
        url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
        request = WebRequest()
        try:
            res = request.get(url).json()
            for row in res['RESULT']['rows']:
                yield '{}:{}'.format(row['ip'], row['port'])
        except Exception:
            pass

    @staticmethod
    def freeProxySeventh():
        """
        Crawl kuaidaili free list https://www.kuaidaili.com/free/inha/1/

        :return: generator of "ip:port" strings
        """
        url = 'https://www.kuaidaili.com/free/inha/{page}/'
        for page in range(1, 10):
            page_url = url.format(page=page)
            tree = getHtmlTree(page_url)
            proxy_list = tree.xpath('.//table//tr')
            for tr in proxy_list[1:]:
                try:
                    # Per-row guard for consistency with the other crawlers:
                    # previously one malformed row killed the generator.
                    yield ':'.join(tr.xpath('./td/text()')[0:2])
                except Exception:
                    pass
if __name__ == '__main__':
    # Smoke test: print every proxy yielded by the kuaidaili crawler.
    # Any other source can be exercised the same way, e.g.:
    #   for p in getter.freeProxyFirst(): print(p)
    # (likewise freeProxySecond ... freeProxySixth).
    getter = GetFreeProxy()
    for proxy in getter.freeProxySeventh():
        print(proxy)