Data_Extraction.py
"""Scrape residential property listings for Santacruz (West), Mumbai from
99acres.com and append the extracted details to a CSV file."""
import csv
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

CHROMEDRIVER_PATH = r"C:\Users\Ishant\Downloads\chromedriver.exe"  # chromedriver binary
OUTPUT_CSV = r"C:\Users\Ishant\Desktop\Santacruz(West).csv"  # CSV file that stores the extracted data

driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))  # initialise the Chrome webdriver (Selenium 4 Service API)

url_extension = 'https://www.99acres.com/'
urls_page = ['https://www.99acres.com/property-in-santacruz-west-mumbai-south-west-ffid-page-' +
             str(i) for i in range(1, 10)]  # pagination links


def soup_text(soup, span_id, default):
    """Return the text of the <span> with the given id, or the default if it is missing."""
    span = soup.find('span', {'id': span_id})
    return span.text if span is not None else default


def xpath_text(xpath, default):
    """Return the text of the element at the given XPath, or the default if it is missing."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


# Write the CSV header row once before scraping.
with open(OUTPUT_CSV, 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['SuperBuiltUp Area', 'Area', 'Carpet Area', 'Configuration', 'Price',
                     'Price per sq.Ft', 'Age', 'Floor Number', 'Address', 'Parking',
                     'Furnishing', 'Location'])
for x in urls_page:
    try:
        driver.get(x)  # open the listing-results page
        try:
            # Scroll to the bottom repeatedly so that lazily loaded listings are rendered.
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                # Scroll down to the bottom.
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1.5)  # wait for the page to load
                # Compare the new scroll height with the previous one; stop once it no longer grows.
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height
        except Exception:
            pass  # if scrolling fails, parse whatever has loaded so far
        page = driver.page_source
        soup = BeautifulSoup(page, 'html.parser')
        ab = soup.find_all('div', {"class": "wrapttl"})  # title divs that contain the listing 'a' tags
        for span in ab:  # go through all listings found on the page
            links = span.find_all('a')  # extract the listing links only
            for i in links:
                try:
                    driver.get(url_extension + i['href'])  # open the individual listing
                    pg = driver.page_source
                    soup = BeautifulSoup(pg, 'html.parser')
                    # Area figures; '-' when the listing does not provide them.
                    area = soup_text(soup, 'builtupArea_span', '-')
                    superbuiltuparea = soup_text(soup, 'superbuiltupArea_span', '-')
                    carpetarea = soup_text(soup, 'carpetArea_span', '-')
                    # Room counts are concatenated into a single configuration string.
                    bedroom = soup_text(soup, 'bedRoomNum', '')
                    bathroom = soup_text(soup, 'bathroomNum', '')
                    balcony = soup_text(soup, 'balconyNum', '')
                    configuration = bathroom + bedroom + balcony
                    if not configuration:
                        configuration = '-'
                    price = soup.find('span', {'class': 'pdPropValue'})
                    price = price.text if price is not None else '-'
                    price_per_sq_feet = xpath_text(
                        "/html/body/div[4]/div[4]/div[1]/div[2]/div/table/tbody/tr[2]/td[1]/div[3]", '-')
                    price_per_sq_feet = re.sub('View Price Details', '', price_per_sq_feet)
                    price_per_sq_feet = re.sub('@', '', price_per_sq_feet)
                    age = xpath_text('//*[@id="agePossessionLbl"]', '-')
                    floorno = soup_text(soup, 'floorNumLabel', '-')
                    furnishing = soup_text(soup, 'furnishing', '-')
                    # Keep only the parking count; drop the 'Covered'/'Open' labels.
                    parking = soup_text(soup, 'reservedParking', '-')
                    parking = re.sub('Covered', '', parking)
                    parking = re.sub('Open', '', parking)
                    address1 = xpath_text(
                        "/html/body/div[4]/div[4]/div[1]/div[2]/div/table/tbody/tr[2]/td[2]/div[2]", '-')
                    address2 = xpath_text(
                        "/html/body/div[4]/div[4]/div[1]/div[2]/div/table/tbody/tr[2]/td[2]/div[3]", '')
                    address = address1 + address2
                    print(superbuiltuparea)
                    print(carpetarea)
                    # Append the extracted fields for this listing as one CSV row.
                    row = [superbuiltuparea, area, carpetarea, configuration, price,
                           price_per_sq_feet, age, floorno, address, parking, furnishing,
                           'Santacruz West']
                    with open(OUTPUT_CSV, 'a', newline='') as f:
                        writer = csv.writer(f)
                        print(row)
                        writer.writerow(row)
                except Exception:
                    # Log listings that could not be scraped so they can be revisited later.
                    with open(r'C:\Users\Ishant\Desktop\log_Santacruz(West).csv', 'a') as f1:
                        f1.write(str(url_extension + i['href']) + '\n')
    except Exception:
        # Log result pages that could not be scraped.
        with open(r'C:\Users\Ishant\Desktop\log_Santacruz(West)_pagelinks.csv', 'a') as f1:
            f1.write(x + '\n')
driver.close()
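# Usage note (assumed environment, not part of the original script): this expects
# Python 3 with the selenium and beautifulsoup4 packages installed, plus a
# chromedriver build matching the local Chrome version, e.g.:
#   pip install selenium beautifulsoup4
#   python Data_Extraction.py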