scraper.py
# import libraries
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import time
import pandas as pd
import os
from pprint import pprint
from datetime import datetime as DateTime, timedelta as TimeDelta
from dotenv import load_dotenv
import csv
from io import StringIO
from sqlalchemy import create_engine

# Load the database connection string (DBPATH) from a .env file
load_dotenv()
DBPATH = os.getenv('DBPATH')
# Search configuration
start_date = '2019-09-15'   # first outbound date to search
search_period = 1           # number of consecutive outbound dates to search
trip_length = 24            # days between the outbound and inbound flights
departure_city = 'HOU'
arrival_city = 'RIO'
data = []                   # scraped flight records, filled by run_driver()
db_table = 'flights'
def create_run(engine):
    # Register this scraping run in the `runs` table and return its id
    q = engine.execute(
        "INSERT INTO runs(departure_city, arrival_city, datetime) "
        "VALUES ('" + departure_city + "', '" + arrival_city + "', current_timestamp) RETURNING id;")
    return q.first()[0]
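# The INSERT above assumes a pre-existing `runs` table. A minimal sketch of a
# compatible schema (an assumption; the real schema may differ) would be:
#
#   CREATE TABLE runs (
#       id SERIAL PRIMARY KEY,
#       departure_city TEXT,
#       arrival_city TEXT,
#       datetime TIMESTAMP
#   );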
def run_driver(run_number):
    def scrape(way):
        # Collect every flight card on the current results page
        flights = driver.find_elements_by_css_selector("div[class^='flight-item ']")
        count_flights = len(flights)
        print('[', way, '] Number of Flights:', count_flights)
        if count_flights == 0:
            # No results usually means the site blocked us; back off for a while
            pprint("Blocked: sleeping a little")
            time.sleep(600)
        for flight in flights:
            flight_times = []
            flight_airports = []
            flight_dates = []
            airline = flight.find_element_by_css_selector("span[class='airline-name']")
            duration = flight.find_element_by_css_selector("span[class^='flight-duration']")
            stops = flight.find_element_by_css_selector("span[class='flight-stops']")
            price = flight.find_element_by_css_selector("span[id^='tooltip-flight']")
            # Each flight card has a departure block and an arrival block
            details = flight.find_elements_by_css_selector("div.col-xs-4.col-md-4:not(.no-padding)")
            for detail in details:
                flight_times.append(detail.find_element_by_css_selector("span[class='flight-time']"))
                flight_airports.append(detail.find_element_by_css_selector("span[class='flight-destination']"))
                flight_dates.append(detail.find_element_by_css_selector("span[class='flight-data']"))
            data.append({
                "way": way,
                "airline": airline.text,
                "departure_airport": flight_airports[0].text,
                "departure_date": flight_dates[0].text,
                "departure_time": flight_times[0].text,
                "duration": duration.text,
                "stops": stops.text,
                "arrival_airport": flight_airports[1].text,
                "arrival_date": flight_dates[1].text,
                "arrival_time": flight_times[1].text,
                "price": price.text[3:-1],  # strip the currency prefix (e.g. "R$ ") and trailing char
                "run_id": run_number,
            })
    # Search one round trip per day in the search period
    for day in range(search_period):
        date = DateTime.strptime(start_date, "%Y-%m-%d")
        outbound_date = (date + TimeDelta(days=day)).strftime('%Y-%m-%d')
        inbound_date = (date + TimeDelta(days=day) + TimeDelta(days=trip_length)).strftime('%Y-%m-%d')
        urlpage = ('https://www.maxmilhas.com.br/busca-passagens-aereas/RT/'
                   + departure_city + '/' + arrival_city + '/'
                   + outbound_date + '/' + inbound_date + '/1/0/0/EC')
        options = FirefoxOptions()
        options.add_argument("--headless")
        driver = webdriver.Firefox(options=options)
        driver.get(urlpage)
        print('Outbound: ' + outbound_date + ' Inbound: ' + inbound_date)
        # Give the page time to load its results before scraping
        time.sleep(40)
        try:
            scrape("Outbound")
            # Switch to the return ("volta") leg of the results and scrape it too
            driver.find_element_by_xpath("//span[text()='volta']").click()
            scrape("Inbound")
        except Exception:
            continue
        finally:
            driver.quit()
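# Note: the element-lookup calls above (find_element_by_css_selector,
# find_element_by_xpath, ...) use the Selenium 3 API; Selenium 4 removed them
# in favour of find_element(By.CSS_SELECTOR, ...). Running this script as-is
# assumes Selenium 3.x and a geckodriver binary available on the PATH.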
def insert_db(data, engine):
    def psql_insert_copy(table, conn, keys, data_iter):
        # Get a DBAPI connection that can provide a cursor
        dbapi_conn = conn.connection
        with dbapi_conn.cursor() as cur:
            # Write the rows to an in-memory CSV buffer
            s_buf = StringIO()
            writer = csv.writer(s_buf)
            writer.writerows(data_iter)
            s_buf.seek(0)

            columns = ', '.join('"{}"'.format(k) for k in keys)
            if table.schema:
                table_name = '{}.{}'.format(table.schema, table.name)
            else:
                table_name = table.name

            # Bulk-load the buffer with PostgreSQL COPY
            sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(table_name, columns)
            cur.copy_expert(sql=sql, file=s_buf)

    df = pd.DataFrame(data)
    df.to_sql(db_table, engine, method=psql_insert_copy, if_exists='append', index=False)
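# psql_insert_copy follows the COPY-based "insertion method" recipe from the
# pandas to_sql documentation; it is much faster than row-by-row INSERTs on
# PostgreSQL, but it assumes that the target `flights` table, if it already
# exists, has columns matching the keys of the scraped records.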
def main():
    engine = create_engine(DBPATH)
    run_number = create_run(engine)
    run_driver(run_number)
    insert_db(data, engine)
    pprint('Finished')

if __name__ == "__main__":
    main()
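# Usage sketch (assumptions: Firefox plus geckodriver installed, and a
# PostgreSQL database reachable via the DBPATH connection string in a .env
# file, e.g. DBPATH=postgresql://user:pass@localhost/mydb):
#
#   python scraper.py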