-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path链家租房房源信息.py
140 lines (129 loc) · 5.86 KB
/
链家租房房源信息.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
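# Scrape Lianjia (bj.lianjia.com) rental listings for Beijing's Chaoyang district and store them in a local MySQL database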
import requests
import re
import pymysql
# Module-level MySQL connection and cursor used by getdata() for every insert.
# connect() is called with keyword arguments because PyMySQL >= 1.0 rejects
# positional ones; the keyword names below are also accepted by older releases.
# NOTE(review): the credentials are hard-coded in source - consider loading
# them from the environment or a config file kept out of version control.
db = pymysql.connect(host='localhost', user='root', password='636458', database='petzhang')
cursor = db.cursor()
# Browser-like User-Agent header sent with every HTTP request made by getdata().
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.9 Safari/537.36'}
# Seconds to wait for the site before giving up on a single HTTP request.
_REQUEST_TIMEOUT = 10

# Parameterized INSERT: the driver escapes every value, so quotes inside
# scraped text can neither break nor inject into the statement.
_INSERT_SQL = (
    'insert into `chaoyang` (`房源标题`,`网址`,`市区`,`商圈`,`小区`,`租赁方式`,`价格`,'
    '`房屋类型`,`面积`,`朝向`,`入住`,`楼层`,`电梯`,`用水`,`用电`,`燃气`,`采暖`,`租期`,'
    '`代理人`,`联系方式`) values '
    '(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
)

# Detail-page patterns (one capture group each), listed in the order their
# values follow the title/url/location columns in _INSERT_SQL.
_DETAIL_FIELD_PATTERNS = (
    r'<li><span class="label">租赁方式:</span>(.*?)</li>',  # rental method
    r'<span>(.*?)</span>元/月',                              # price
    r'<li><span class="label">房屋类型:</span>(.*?)</li>',  # house type
    r'<li class="fl oneline">面积:(.*?)</li>',              # area
    r'<li class="fl oneline">朝向:(.*?)</li>',              # orientation
    r'<li class="fl oneline">入住:(.*?)</li>',              # move-in date
    r'<li class="fl oneline">楼层:(.*?)</li>',              # floor
    r'<li class="fl oneline">电梯:(.*?)</li>',              # elevator
    r'<li class="fl oneline">用水:(.*?)</li>',              # water
    r'<li class="fl oneline">用电:(.*?)</li>',              # electricity
    r'<li class="fl oneline">燃气:(.*?)</li>',              # gas
    r'<li class="fl oneline">采暖:(.*?)</li>',              # heating
    r'<li class="fl oneline">租期:(.*?)</li>',              # lease term
    r'name":"(.*?)","office',                               # agent name
    r'phone400":"(.*?)","phone',                            # agent phone
)


def _first(pattern, text):
    """Return the first capture of *pattern* in *text*, or '' if it is absent."""
    found = re.search(pattern, text)
    return found.group(1) if found else ''


def _at(values, index):
    """Return values[index], or '' when the list is shorter than expected."""
    return values[index] if index < len(values) else ''


def getdata(n):
    """Scrape listing page *n* of the Chaoyang rentals and insert each listing.

    The listing page supplies the location columns and the links to the
    detail pages; every detail page is then fetched and the remaining columns
    are extracted from it with regular expressions. One row per listing is
    inserted into the `chaoyang` table through the module-level ``cursor`` and
    committed on ``db``.

    Failures are handled per record: a request that fails, a detail page
    without a title, or a rejected insert is reported with ``print`` and
    skipped, so one bad listing no longer discards the rest of the page.
    A field that cannot be found on a detail page is stored as ''.

    Args:
        n: 1-based index of the listing page to scrape.

    Returns:
        None. Progress and errors are printed.
    """
    first_url = 'https://bj.lianjia.com/zufang/chaoyang/pg{}'.format(n)
    try:
        response = requests.get(first_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(e)
        return
    listing_html = response.text

    # District (Chaoyang, Haidian, ...): keep only the Chinese characters of
    # each matched anchor fragment.
    loca1 = [
        ''.join(re.findall(r'[\u4e00-\u9fa5]+', fragment))
        for fragment in re.findall(
            r'<a target="_blank" href="/zufang/(.*?)/a>-<a href="/zufang/', listing_html)
    ]
    # Second location level; it is stored in the `商圈` column.
    loca2 = re.findall(r'target="_blank">(.*?)</a>-<a title=', listing_html)
    # Third location level; it is stored in the `小区` column.
    loca3 = re.findall(r'</a>-<a title="(.*?)" href=', listing_html)
    detail_url = re.findall(r'<a target="_blank" href="/zufang/BJ(.*?)">', listing_html)

    # NOTE(review): location entries are paired with detail links purely by
    # position - assumes both appear once per listing in the same order; confirm
    # against the live page markup.
    rows = []
    for position, suffix in enumerate(detail_url):
        detailurl = 'https://bj.lianjia.com/zufang/BJ{}'.format(suffix)
        try:
            detail_html = requests.get(
                detailurl, headers=headers, timeout=_REQUEST_TIMEOUT).text
        except requests.RequestException as e:
            print(e)
            continue

        title = _first(r'<p class="content__title">(.*?)</p>', detail_html)
        if not title:
            # Without a title the page did not parse as a listing; skip it
            # instead of inserting a row of empty strings.
            print('skipped (no title found): ' + detailurl)
            continue

        row = [title, detailurl,
               _at(loca1, position), _at(loca2, position), _at(loca3, position)]
        row.extend(_first(pattern, detail_html) for pattern in _DETAIL_FIELD_PATTERNS)
        rows.append(tuple(row))

    print('page' + '-' + str(n))
    for row in rows:
        try:
            cursor.execute(_INSERT_SQL, row)
            db.commit()
        except pymysql.MySQLError as e:
            print(e)
            continue
        print("已存储" + row[0])
# The triple-quoted block below is a bare string literal, not live code: it is
# a disabled draft of a savedata() function that repeats the insert loop found
# in getdata(). It reads title1, url1, loca1, ... which are local variables of
# getdata(), so it could not run as a standalone function as written.
'''
def savedata():
try:
for j in range(len(title1)):
# print(title1[j][0])
sql = 'insert into `chaoyang` (`房源标题`,`网址`,`市区`,`商圈`,`小区`,`租赁方式`,`价格`,`房屋类型`,`面积`,`朝向`,`入住`,`楼层`,`电梯`,`用水`,`用电`,`燃气`,`采暖`,`租期`,`代理人`,`联系方式`) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")' \
.format(title1[j][0], url1[j], loca1[j], loca2[j], loca3[j], method1[j][0],
price1[j][0], leixing1[j][0], square1[j][0], chaoxiang1[j], ruzhushijian1[j][0],
louceng1[j][0], dianti1[j][0], yongshui1[j][0], yongdian1[j][0], ranqi1[j][0],
cainuan1[j][0], zuqi1[j][0], agent1[j][0], phone1[j][0])
cursor.execute(sql)
db.commit()
print("已存储" + title1[j][0])
except Exception as e:
print(e)
'''
if __name__ == '__main__':
    # Crawl listing pages 1 through 100. The finally clause releases the
    # module-level cursor and connection even if a page raises part-way.
    try:
        for n in range(1, 101):
            getdata(n)
    finally:
        cursor.close()
        db.close()