add support for proxy, change throttle, fix errors #14

Open
wants to merge 13 commits into base: main
5 changes: 5 additions & 0 deletions .env.dist
@@ -1,6 +1,11 @@
#
# Basic settings
#
#PROXY="http://localhost:8080"
#CA_CERT="/home/user/customproxy_ca.crt"

# Throttle between requests, in seconds
#THROTTLE=2

# Airbnb client key
AIRBNB_API_KEY=d306zoyjsyarp7ifhu67rjxn52tv0t20
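The three new settings are optional and are read with os.getenv defaults elsewhere in this PR. A minimal sketch of how they are consumed (the values shown are the samples from .env.dist):

import os

proxy = os.getenv('PROXY', None)          # e.g. "http://localhost:8080"; None leaves proxying disabled
ca_cert = os.getenv('CA_CERT', None)      # path to the intercepting proxy's CA certificate, if any
throttle = int(os.getenv('THROTTLE', 2))  # upper bound, in seconds, of the random sleep between API requests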
24 changes: 16 additions & 8 deletions stl/command/stl_command.py
@@ -81,14 +81,18 @@ def execute(self):
scraper.run(source, self.__args.get('--updated'))

elif self.__args.get('data'):
pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, self.__logger)
ca_cert = os.getenv('CA_CERT', None)
throttle = int(os.getenv('THROTTLE', 2))
pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ca_cert, throttle, self.__logger)
print(json.dumps(pdp.get_raw_listing(self.__args.get('<listingId>'))))

elif self.__args.get('pricing'):
listing_id = self.__args.get('<listingId>')
checkin = self.__args.get('--checkin')
checkout = self.__args.get('--checkout')
pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, self.__logger)
ca_cert = os.getenv('CA_CERT', None)
throttle = int(os.getenv('THROTTLE', 2))
pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ca_cert, throttle, self.__logger)
total = pricing.get_pricing(checkin, checkout, listing_id)
print('https://www.airbnb.com/rooms/{} - {} to {}: {}'.format(listing_id, checkin, checkout, total))

@@ -103,14 +107,18 @@ def __create_scraper(
) -> AirbnbScraperInterface:
"""Create scraper of given type using given parameters."""
api_key = os.getenv('AIRBNB_API_KEY')
proxy = os.getenv('PROXY', None)
ca_cert = os.getenv('CA_CERT', None)

throttle = int(os.getenv('THROTTLE', 2))
if scraper_type == 'search':
explore = Explore(api_key, currency, self.__logger)
pdp = Pdp(api_key, currency, self.__logger)
reviews = Reviews(api_key, currency, self.__logger)
return AirbnbSearchScraper(explore, pdp, reviews, persistence, self.__logger)
explore = Explore(api_key, currency, proxy, ca_cert, throttle, self.__logger)
pdp = Pdp(api_key, currency, proxy, ca_cert, throttle, self.__logger)
reviews = Reviews(api_key, currency, proxy, ca_cert, throttle, self.__logger)
return AirbnbSearchScraper(explore, pdp, reviews, persistence, currency, self.__logger)
elif scraper_type == 'calendar':
pricing = Pricing(api_key, currency, self.__logger)
calendar = Calendar(api_key, currency, self.__logger, pricing)
pricing = Pricing(api_key, currency, proxy, ca_cert, throttle, self.__logger)
calendar = Calendar(api_key, currency, proxy, ca_cert, throttle, self.__logger, pricing)
return AirbnbCalendarScraper(calendar, persistence, self.__logger)
else:
raise RuntimeError('Unknown scraper type: %s' % scraper_type)
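One caveat with the pattern above: int(os.getenv('THROTTLE', 2)) raises ValueError if THROTTLE is set to a non-numeric string. A defensive variant (a sketch, not part of this PR) could fall back to the default:

import os

def read_throttle(default: int = 2) -> int:
    """Read THROTTLE from the environment, falling back to the default on bad input."""
    raw = os.getenv('THROTTLE', str(default))
    try:
        return max(0, int(raw))
    except ValueError:
        return default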
31 changes: 20 additions & 11 deletions stl/endpoint/base_endpoint.py
@@ -9,16 +9,23 @@

from stl.exception.api import ApiException, ForbiddenException

import urllib3
urllib3.disable_warnings()  # silence urllib3 warnings (e.g. InsecureRequestWarning) when TLS is intercepted by the proxy


class BaseEndpoint(ABC):
API_PATH = None
SOURCE = 'airbnb'

def __init__(self, api_key: str, currency: str, logger: Logger, locale: str = 'en'):
def __init__(self, api_key: str, currency: str, proxy: str, ca_cert: str, throttle: int, logger: Logger, locale: str = 'en'):
self._api_key = api_key
self._currency = currency
self._locale = locale
self._logger = logger
self._proxy = {'http': proxy, 'https': proxy}
self._throttle = throttle
self._ca_cert = ca_cert

@staticmethod
def build_airbnb_url(path: str, query=None):
@@ -30,20 +37,22 @@ def build_airbnb_url(path: str, query=None):
def _api_request(self, url: str, method: str = 'GET', data=None) -> dict:
if data is None:
data = {}

attempts = 0
headers = {'x-airbnb-api-key': self._api_key}
max_attempts = 3
max_attempts = 5
while attempts < max_attempts:
sleep(randint(0, 2)) # do a little throttling
sleep(randint(0, self._throttle))  # do a little throttling
attempts += 1
response = requests.request(method, url, headers=headers, data=data)
response_json = response.json()
errors = response_json.get('errors')
if not errors:
return response_json

self.__handle_api_error(url, errors)
response = requests.request(method, url, headers=headers, data=data, proxies=self._proxy, verify=self._ca_cert)
try:
    response_json = response.json()
except ValueError:
    # Response body was not valid JSON (e.g. an HTML error page from the proxy); log it and retry.
    self._logger.error('Non-JSON response for %s request to "%s": %s', method, url, response.text)
    continue
errors = response_json.get('errors')
if not errors:
    return response_json
self.__handle_api_error(url, errors)

raise ApiException(['Could not complete API {} request to "{}"'.format(method, url)])

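For reviewers, this is roughly the requests call the endpoint now issues; the URL, API key, and CA path below are placeholders, not values from this PR:

import requests

proxies = {'http': 'http://localhost:8080', 'https': 'http://localhost:8080'}
response = requests.request(
    'GET',
    'https://www.airbnb.com/api/v3/ExampleOperation',  # placeholder endpoint
    headers={'x-airbnb-api-key': 'YOUR_API_KEY'},      # placeholder key
    proxies=proxies,                                   # route both http and https traffic through the proxy
    verify='/home/user/customproxy_ca.crt',            # trust the proxy's CA; pass True when no custom CA is needed
)
print(response.status_code)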
4 changes: 2 additions & 2 deletions stl/endpoint/calendar.py
@@ -133,8 +133,8 @@ class Calendar(BaseEndpoint):
API_PATH = '/api/v3/PdpAvailabilityCalendar'
N_MONTHS = 12 # number of months of data to return; 12 months == 1 year

def __init__(self, api_key: str, currency: str, logger: Logger, pricing: Pricing):
super().__init__(api_key, currency, logger)
def __init__(self, api_key: str, currency: str, proxy: str, ca_cert: str, throttle: int, logger: Logger, pricing: Pricing):
super().__init__(api_key, currency, proxy, ca_cert, throttle, logger)
self.__pricing = pricing
self.__today = datetime.today()

1 change: 1 addition & 0 deletions stl/endpoint/explore.py
@@ -52,6 +52,7 @@ def get_url(self, search_string: str, params: dict = None):

def search(self, url: str):
data = self._api_request(url)
#print(data['data']['dora']['exploreV3']['metadata']['paginationMetadata'])
pagination = data['data']['dora']['exploreV3']['metadata']['paginationMetadata']

return data, pagination
28 changes: 15 additions & 13 deletions stl/endpoint/pdp.py
@@ -70,9 +70,9 @@ class Pdp(BaseEndpoint):

SECTION_NAMES = ['amenities', 'description', 'host_profile', 'location', 'policies']

def __init__(self, api_key: str, currency: str, logger: Logger):
super().__init__(api_key, currency, logger)
self.__geocoder = Geocoder()
def __init__(self, api_key: str, currency: str, proxy: str, ca_cert: str, throttle: int, logger: Logger):
super().__init__(api_key, currency, proxy, ca_cert, throttle, logger)
self.__geocoder = Geocoder(proxy, ca_cert)
self.__regex_amenity_id = re.compile(r'^([a-z0-9]+_)+([0-9]+)_')

@staticmethod
@@ -359,8 +359,9 @@ def __determine_city_and_neighborhood(self, listing: dict, geography: dict):
if reverse_geo_address['city'] in [search_city, city, localized_city] or self.__geocoder.is_city(reverse_geo_address['city'], reverse_geo_address['country']):
return reverse_geo_address['city'], localized_neighborhood

if self.__geocoder.is_city((city or localized_city), reverse_geo_address['country']):
return city or localized_city, neighborhood
if reverse_geo_address:
if self.__geocoder.is_city((city or localized_city), reverse_geo_address['country']):
return city or localized_city, neighborhood

return city, neighborhood

@@ -391,8 +392,8 @@ def __get_price_key(pricing) -> str:
def __get_price_rate(pricing) -> int | None:
if pricing:
price_key = Pdp.__get_price_key(pricing)
return int(pricing['structuredStayDisplayPrice']['primaryLine'][price_key].lstrip('$').replace(',', ''))

res = pricing['structuredStayDisplayPrice']['primaryLine'][price_key].replace('\xa0', ' ')
return int(''.join(filter(str.isdigit, res)))
return None

@staticmethod
@@ -406,16 +407,17 @@ def __get_rate_type(pricing) -> str | None:
def __get_total_price(pricing) -> int | None:
if pricing['structuredStayDisplayPrice']['secondaryLine']:
price = pricing['structuredStayDisplayPrice']['secondaryLine']['price']
amount_match = re.match(r'\$([\w,]+) total', price)
else:
price_key = Pdp.__get_price_key(pricing)
price = pricing['structuredStayDisplayPrice']['primaryLine'][price_key]
amount_match = re.match(r'\$([\w,]+)', price)

if not amount_match:
raise ValueError('No amount match found for price: %s' % price)

amount_match = ''.join(filter(str.isdigit, price))

return int(amount_match[1].replace(',', ''))
if amount_match == '':
    # No digits found in the price string; return None instead of raising.
    return None
else:
    return int(amount_match)
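A worked example of the digit-filter parsing used in __get_price_rate and __get_total_price, with its main assumption spelled out:

price = '$1,024 total'
amount = ''.join(filter(str.isdigit, price))  # -> '1024'
total = int(amount) if amount else None       # -> 1024
# Assumption: display prices are whole amounts; a value like '€45.50' would parse as 4550.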

@staticmethod
def __html_to_text(html: str) -> str:
17 changes: 10 additions & 7 deletions stl/endpoint/reviews.py
@@ -34,13 +34,16 @@ def __get_reviews_batch(self, listing_id: str, limit: int, offset: int):
else:
n_reviews_total = 0

reviews = [{
'comments': r['comments'],
'created_at': r['createdAt'],
'language': r['language'],
'rating': r['rating'],
'response': r['response'],
} for r in pdp_reviews['reviews']]
if pdp_reviews is not None:
reviews = [{
'comments': r['comments'],
'created_at': r['createdAt'],
'language': r['language'],
'rating': r['rating'],
'response': r['response'],
} for r in pdp_reviews['reviews']]
else:
reviews = []

return reviews, n_reviews_total

49 changes: 34 additions & 15 deletions stl/geo/geocode.py
@@ -4,14 +4,27 @@
from geopy.extra.rate_limiter import RateLimiter
from random import randint

import ssl
import geopy.geocoders



class Geocoder:

def __init__(self) -> None:
def __init__(self, proxy: str, ca_cert: str) -> None:
gmaps_api_key = os.environ.get('GMAPS_API_KEY')
self.__gmaps = GoogleV3(api_key=gmaps_api_key) if gmaps_api_key else None
user_agent = 'stl-scraper-{}'.format(randint(1, 10000))
self.__geolocator = Nominatim(user_agent=user_agent)
user_agent = 'stl-scraper-test-{}'.format(randint(1, 100000))

proxy = {'http': proxy, 'https': proxy}

if ca_cert:
ctx = ssl.create_default_context(cafile=ca_cert)
geopy.geocoders.options.default_ssl_context = ctx
geopy.geocoders.options.default_timeout = 2

self.__geolocator = Nominatim(user_agent=user_agent, proxies=proxy)
self.__osm_reverse_geo = RateLimiter(self.__geolocator.reverse, min_delay_seconds=1)

def is_city(self, name: str, country: str):
@@ -27,20 +40,26 @@ def is_city(self, name: str, country: str):
def reverse(self, lat: float, lon: float) -> dict | bool:
"""Tries OSM reverse geocoder (Nomatim) first. If it fails, tries Google Maps reverse geocoder (untested)."""
# Try OSM
address = self.__osm_reverse_geo((lat, lon), language='en').raw['address']
if 'city' in address:
return address
if 'town' in address:
address['city'] = address['town']
return address
if 'state' in address:
address['city'] = address['state']
return address
try:
address = self.__osm_reverse_geo((lat, lon), language='en').raw['address']
if 'city' in address:
return address
if 'town' in address:
address['city'] = address['town']
return address
if 'state' in address:
address['city'] = address['state']
return address
except Exception:
    # OSM reverse geocoding failed; fall back to Google Maps below.
    pass

# Else try google maps
if self.__gmaps:
address = self.__gmaps.reverse((lat, lon), language='en')
if 'city' in address:
return address
try:
address = self.__gmaps.reverse((lat, lon), language='en')
if 'city' in address:
return address
except Exception:
    pass

return False
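A possible end-to-end smoke test for the proxy wiring (not part of this PR; the coordinates and proxy values are illustrative):

from stl.geo.geocode import Geocoder

geo = Geocoder(proxy='http://localhost:8080', ca_cert='/home/user/customproxy_ca.crt')
print(geo.reverse(48.8566, 2.3522))  # expect an address dict containing a 'city' key, or False on failure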
8 changes: 6 additions & 2 deletions stl/persistence/csv.py
@@ -8,8 +8,12 @@ class Csv(PersistenceInterface):
def __init__(self, csv_path: str):
self.__csv_path = csv_path

def save(self, query: str, listings: list):
with open(self.__csv_path, 'w', encoding='utf-8', newline='') as csvfile:
def save(self, query: str, listings: list, continuous: bool = False):
# Append when running continuously so earlier batches are not overwritten.
mode = 'a' if continuous else 'w'
with open(self.__csv_path, mode, encoding='utf-8', newline='') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=listings[0].keys())
writer.writeheader()
writer.writerows(listings)
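One side effect of appending: writeheader() still runs on every call, so continuous runs will interleave header rows in the CSV. A possible refinement inside Csv.save (a sketch, not part of this PR; it assumes os is imported at the top of the module):

mode = 'a' if continuous else 'w'
# Write the header only when the file is created or truncated, so appended batches do not repeat it.
write_header = mode == 'w' or not os.path.exists(self.__csv_path) or os.path.getsize(self.__csv_path) == 0
with open(self.__csv_path, mode, encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=listings[0].keys())
    if write_header:
        writer.writeheader()
    writer.writerows(listings)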
2 changes: 1 addition & 1 deletion stl/persistence/elastic.py
@@ -128,7 +128,7 @@ def mark_deleted(self, listing_id: str):
"""Mark a listing as deleted by setting the 'deleted' field to True."""
self.__es.update(index=self.__index, id=listing_id, doc={'deleted': True})

def save(self, query: str, listings: list):
def save(self, query: str, listings: list, continuous: bool = False):
"""Bulk save listings by upsert."""
bulk(self.__es, index=self.__index, actions=[{
'_op_type': 'update',