added working version of scraper for atlascen
Many98 committed Oct 4, 2022
1 parent d8233f8 commit a2b0804
Showing 1 changed file with 21 additions and 5 deletions.
src/scrapers/scrape_atlas_cen.py (26 changes: 21 additions & 5 deletions)
@@ -34,7 +34,8 @@ def scrape(self, driver: WebDriver, url: str) -> list:
                                                             '//*[@id="__next"]/div/main/div[2]/div[2]/div[2]/div[1]/div/div/p')))
         time.sleep(3)
         page_count = int(re.sub('[^0-9]', '', ele.text)) // 30 + 1
-        while pagination:  # page <= page_count
+        stop = 0
+        while pagination and page <= page_count:
             time.sleep(1)
             x = int(50 * page / page_count) + 1
             print(f'Page {page}/{page_count} [{u"█" * x}{"." * (51 - x)}]', end="\r", flush=True)  # 30 results per page is set in query
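The loop is now bounded: page_count is derived from the result-count element (30 results per page) and the progress bar fills up to 51 character cells. A minimal standalone sketch of that arithmetic, with a hypothetical count string standing in for the scraped element text:

    import re

    total_text = "1 234 results"  # hypothetical stand-in for ele.text
    page_count = int(re.sub('[^0-9]', '', total_text)) // 30 + 1  # 30 results per page

    for page in range(1, page_count + 1):
        x = int(50 * page / page_count) + 1  # filled cells, at most 51
        print(f'Page {page}/{page_count} [{"█" * x}{"." * (51 - x)}]', end="\r", flush=True)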
@@ -51,8 +52,14 @@ def scrape(self, driver: WebDriver, url: str) -> list:
             except:
                 # probably pop-up screen appeared
                 self.close_pop_up(driver)
                 time.sleep(2)
+                if stop == 3:
+                    driver.refresh()
+                elif stop > 3:
+                    raise Exception('Unknown page error')
+                stop += 1
                 continue
 
+            stop = 0
             time.sleep(1)
             dynamic_button = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH,
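The new stop counter retries a failing page instead of looping forever on a pop-up: the third consecutive failure triggers a browser refresh, the fourth raises, and a successful pass resets the counter to zero. The same pattern as a self-contained sketch (fetch_with_retry and load_page are illustrative names, not from the repo):

    def fetch_with_retry(driver, load_page, max_soft_fails=3):
        """Refresh on the 3rd consecutive failure, raise on the 4th."""
        stop = 0
        while True:
            try:
                return load_page(driver)  # any step that may fail, e.g. a WebDriverWait
            except Exception:
                if stop == max_soft_fails:
                    driver.refresh()  # hard refresh before the last attempt
                elif stop > max_soft_fails:
                    raise Exception('Unknown page error')
                stop += 1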
@@ -71,17 +78,22 @@ def scrape(self, driver: WebDriver, url: str) -> list:
                 data = [{'when': i.text, 'href': j.get_attribute('href'), 'area': k.text, 'price': l.text} for i, j, k, l
                         in zip(when, href, area, price)]
                 data_out = [i for i in data if i['href'] not in existing_hrefs]
-                ll = [list(map(float, i['href'][53:89].replace('%', '').split('2C'))) for i in data_out]  # TODO implement it with regex
+                ll = [list(map(float, re.findall(r"\d+\.\d+", i['href'][:110]))) for i in data_out]
                 for i, j in zip(data_out, ll):
                     i.update({'long': j[1], 'lat': j[0]})
                 # finally filter already existing data according to href attribute (kind of unique identifier)
                 self.data = data_out
                 if len(data_out) != len(data):
-                    print('Some records already exists')
+                    print('Some records already exist', end="\r", flush=True)
             except:
                 print('something went wrong')
                 self.close_pop_up(driver)
                 time.sleep(3)
+                if stop == 3:
+                    driver.refresh()
+                elif stop > 3:
+                    raise Exception('Unknown page error')
+                stop += 1
                 continue
 
             try:
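The replaced line drops the brittle fixed offsets (href[53:89] split on the URL-encoded comma "%2C") in favor of a regex that pulls the two coordinate floats out of the first 110 characters of the listing href; the first match is treated as latitude, the second as longitude. Illustration with a made-up href of the shape the code expects:

    import re

    href = 'https://atlascen.example/map?center=50.0874654%2C14.4212535&zoom=16'  # hypothetical URL
    lat, lng = map(float, re.findall(r"\d+\.\d+", href[:110]))
    print(lat, lng)  # 50.0874654 14.4212535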
@@ -93,7 +105,11 @@ def scrape(self, driver: WebDriver, url: str) -> list:
                     next_page_elem[-1].click()
                 except:
                     self.close_pop_up(driver)  # close pop-up
-                    next_page_elem[-1].click()  # go to next page
+                    try:
+                        next_page_elem[-1].click()  # go to next page
+                    except:
+                        pagination = False
+                        continue
             except:
                 pagination = False
                 continue
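The final hunk guards the "next page" click twice: if the first click fails, a pop-up is closed and the click retried; if it still fails, pagination ends cleanly instead of crashing the scrape. As a sketch (helper names are assumptions, not from the repo):

    def click_next_page(driver, next_page_elem, close_pop_up) -> bool:
        """Return True after a successful click, False when pagination should stop."""
        try:
            next_page_elem[-1].click()
        except Exception:
            close_pop_up(driver)  # a pop-up may be covering the button
            try:
                next_page_elem[-1].click()  # retry once with the pop-up closed
            except Exception:
                return False  # still failing: assume the last page was reached
        return True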