diff --git a/.env.dist b/.env.dist index f7d20f5..5f76bab 100644 --- a/.env.dist +++ b/.env.dist @@ -1,6 +1,11 @@ # # Basic settings # +#PROXY="http://localhost:8080" +#CA_CERT="/home/user/customproxy_ca.crt" + +#throttle in s +#THROTTLE=2 # Airbnb client key AIRBNB_API_KEY=d306zoyjsyarp7ifhu67rjxn52tv0t20 diff --git a/stl/command/stl_command.py b/stl/command/stl_command.py index bab6ab8..d926263 100644 --- a/stl/command/stl_command.py +++ b/stl/command/stl_command.py @@ -81,14 +81,18 @@ def execute(self): scraper.run(source, self.__args.get('--updated')) elif self.__args.get('data'): - pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, self.__logger) + ca_cert = os.getenv('CA_CERT', None) + throttle = int(os.getenv('THROTTLE', 2)) + pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ca_cert, throttle, self.__logger) print(json.dumps(pdp.get_raw_listing(self.__args.get('')))) elif self.__args.get('pricing'): listing_id = self.__args.get('') checkin = self.__args.get('--checkin') checkout = self.__args.get('--checkout') - pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, self.__logger) + ca_cert = os.getenv('CA_CERT', None) + throttle = int(os.getenv('THROTTLE', 2)) + pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ca_cert, throttle, self.__logger) total = pricing.get_pricing(checkin, checkout, listing_id) print('https://www.airbnb.com/rooms/{} - {} to {}: {}'.format(listing_id, checkin, checkout, total)) @@ -103,14 +107,18 @@ def __create_scraper( ) -> AirbnbScraperInterface: """Create scraper of given type using given parameters.""" api_key = os.getenv('AIRBNB_API_KEY') + proxy = os.getenv('PROXY', None) + ca_cert = os.getenv('CA_CERT', None) + + throttle = int(os.getenv('THROTTLE', 2)) if scraper_type == 'search': - explore = Explore(api_key, currency, self.__logger) - pdp = Pdp(api_key, currency, self.__logger) - reviews = Reviews(api_key, currency, self.__logger) - return 
AirbnbSearchScraper(explore, pdp, reviews, persistence, self.__logger) + explore = Explore(api_key, currency, proxy, ca_cert, throttle, self.__logger) + pdp = Pdp(api_key, currency, proxy, ca_cert, throttle, self.__logger) + reviews = Reviews(api_key, currency, proxy, ca_cert, throttle, self.__logger) + return AirbnbSearchScraper(explore, pdp, reviews, persistence,currency, self.__logger) elif scraper_type == 'calendar': - pricing = Pricing(api_key, currency, self.__logger) - calendar = Calendar(api_key, currency, self.__logger, pricing) + pricing = Pricing(api_key, currency, proxy, ca_cert, throttle, self.__logger) + calendar = Calendar(api_key, currency, proxy, ca_cert, throttle, self.__logger, pricing) return AirbnbCalendarScraper(calendar, persistence, self.__logger) else: raise RuntimeError('Unknown scraper type: %s' % scraper_type) diff --git a/stl/endpoint/base_endpoint.py b/stl/endpoint/base_endpoint.py index 495a375..2daa167 100644 --- a/stl/endpoint/base_endpoint.py +++ b/stl/endpoint/base_endpoint.py @@ -9,16 +9,23 @@ from stl.exception.api import ApiException, ForbiddenException +import urllib3 +urllib3.disable_warnings() + class BaseEndpoint(ABC): API_PATH = None SOURCE = 'airbnb' - def __init__(self, api_key: str, currency: str, logger: Logger, locale: str = 'en'): + def __init__(self, api_key: str, currency: str, proxy: str, ca_cert: str, throttle:int, logger: Logger, locale: str = 'en'): self._api_key = api_key self._currency = currency self._locale = locale self._logger = logger + self._proxy = {'http': proxy, + 'https': proxy} + self._throttle=throttle + self._ca_cert = ca_cert @staticmethod def build_airbnb_url(path: str, query=None): @@ -30,20 +37,23 @@ def build_airbnb_url(path: str, query=None): def _api_request(self, url: str, method: str = 'GET', data=None) -> dict: if data is None: data = {} attempts = 0 headers = {'x-airbnb-api-key': self._api_key} - max_attempts = 3 + max_attempts = 5 while attempts < max_attempts: - sleep(randint(0,
2)) # do a little throttling + sleep(randint(0,self._throttle)) # do a little throttling attempts += 1 - response = requests.request(method, url, headers=headers, data=data) - response_json = response.json() - errors = response_json.get('errors') - if not errors: - return response_json - - self.__handle_api_error(url, errors) + response = requests.request(method, url, headers=headers, data=data, proxies=self._proxy, verify=self._ca_cert) + try: + response_json = response.json() + errors = response_json.get('errors') + if not errors: + return response_json + else: + self.__handle_api_error(url, errors) + except: + print(f'ERROR ap_request -- {response.text}') raise ApiException(['Could not complete API {} request to "{}"'.format(method, url)]) diff --git a/stl/endpoint/calendar.py b/stl/endpoint/calendar.py index b6ee07f..0ef0603 100644 --- a/stl/endpoint/calendar.py +++ b/stl/endpoint/calendar.py @@ -133,8 +133,8 @@ class Calendar(BaseEndpoint): API_PATH = '/api/v3/PdpAvailabilityCalendar' N_MONTHS = 12 # number of months of data to return; 12 months == 1 year - def __init__(self, api_key: str, currency: str, logger: Logger, pricing: Pricing): - super().__init__(api_key, currency, logger) + def __init__(self, api_key: str, currency: str, proxy: str, ca_cert: str, throttle: int, logger: Logger, pricing: Pricing): + super().__init__(api_key, currency, proxy, ca_cert, throttle, logger) self.__pricing = pricing self.__today = datetime.today() diff --git a/stl/endpoint/explore.py b/stl/endpoint/explore.py index b7ace5c..9c21354 100644 --- a/stl/endpoint/explore.py +++ b/stl/endpoint/explore.py @@ -52,6 +52,7 @@ def get_url(self, search_string: str, params: dict = None): def search(self, url: str): data = self._api_request(url) + #print(data['data']['dora']['exploreV3']['metadata']['paginationMetadata']) pagination = data['data']['dora']['exploreV3']['metadata']['paginationMetadata'] return data, pagination diff --git a/stl/endpoint/pdp.py b/stl/endpoint/pdp.py index 
19cf1b3..d528b83 100644 --- a/stl/endpoint/pdp.py +++ b/stl/endpoint/pdp.py @@ -70,9 +70,9 @@ class Pdp(BaseEndpoint): SECTION_NAMES = ['amenities', 'description', 'host_profile', 'location', 'policies'] - def __init__(self, api_key: str, currency: str, logger: Logger): - super().__init__(api_key, currency, logger) - self.__geocoder = Geocoder() + def __init__(self, api_key: str, currency: str, proxy: str, ca_cert: str, throttle: int, logger: Logger): + super().__init__(api_key, currency, proxy, ca_cert,throttle, logger) + self.__geocoder = Geocoder(proxy,ca_cert) self.__regex_amenity_id = re.compile(r'^([a-z0-9]+_)+([0-9]+)_') @staticmethod @@ -359,8 +359,9 @@ def __determine_city_and_neighborhood(self, listing: dict, geography: dict): if reverse_geo_address['city'] in [search_city, city, localized_city] or self.__geocoder.is_city(reverse_geo_address['city'], reverse_geo_address['country']): return reverse_geo_address['city'], localized_neighborhood - if self.__geocoder.is_city((city or localized_city), reverse_geo_address['country']): - return city or localized_city, neighborhood + if reverse_geo_address : + if self.__geocoder.is_city((city or localized_city), reverse_geo_address['country']): + return city or localized_city, neighborhood return city, neighborhood @@ -391,8 +392,8 @@ def __get_price_key(pricing) -> str: def __get_price_rate(pricing) -> int | None: if pricing: price_key = Pdp.__get_price_key(pricing) - return int(pricing['structuredStayDisplayPrice']['primaryLine'][price_key].lstrip('$').replace(',', '')) - + res=pricing['structuredStayDisplayPrice']['primaryLine'][price_key].replace('\xa0',' ') + return int ( ''.join(filter(str.isdigit, res) ) ) return None @staticmethod @@ -406,16 +407,17 @@ def __get_rate_type(pricing) -> str | None: def __get_total_price(pricing) -> int | None: if pricing['structuredStayDisplayPrice']['secondaryLine']: price = pricing['structuredStayDisplayPrice']['secondaryLine']['price'] - amount_match =
re.match(r'\$([\w,]+) total', price) else: price_key = Pdp.__get_price_key(pricing) price = pricing['structuredStayDisplayPrice']['primaryLine'][price_key] - amount_match = re.match(r'\$([\w,]+)', price) - - if not amount_match: - raise ValueError('No amount match found for price: %s' % price) + + amount_match = ''.join(filter(str.isdigit, price) ) - return int(amount_match[1].replace(',', '')) + if amount_match =='': + #raise ValueError('No amount match found for price: %s' % price) + return None + else: + return int(amount_match) @staticmethod def __html_to_text(html: str) -> str: diff --git a/stl/endpoint/reviews.py b/stl/endpoint/reviews.py index 0dd4ef1..55377aa 100644 --- a/stl/endpoint/reviews.py +++ b/stl/endpoint/reviews.py @@ -34,13 +34,16 @@ def __get_reviews_batch(self, listing_id: str, limit: int, offset: int): else: n_reviews_total = 0 - reviews = [{ - 'comments': r['comments'], - 'created_at': r['createdAt'], - 'language': r['language'], - 'rating': r['rating'], - 'response': r['response'], - } for r in pdp_reviews['reviews']] + if pdp_reviews!=None: + reviews = [{ + 'comments': r['comments'], + 'created_at': r['createdAt'], + 'language': r['language'], + 'rating': r['rating'], + 'response': r['response'], + } for r in pdp_reviews['reviews']] + else: + reviews=[] return reviews, n_reviews_total diff --git a/stl/geo/geocode.py b/stl/geo/geocode.py index 90ac6d0..f6aa5aa 100644 --- a/stl/geo/geocode.py +++ b/stl/geo/geocode.py @@ -4,14 +4,27 @@ from geopy.extra.rate_limiter import RateLimiter from random import randint +import ssl +import geopy.geocoders + + class Geocoder: - def __init__(self) -> None: + def __init__(self,proxy,ca_cert) -> None: gmaps_api_key = os.environ.get('GMAPS_API_KEY') self.__gmaps = GoogleV3(api_key=gmaps_api_key) if gmaps_api_key else None - user_agent = 'stl-scraper-{}'.format(randint(1, 10000)) - self.__geolocator = Nominatim(user_agent=user_agent) + user_agent = 'stl-scraper-test-{}'.format(randint(1, 100000)) + + proxy = 
{'http': proxy, + 'https': proxy} + + if ca_cert: + ctx = ssl.create_default_context(cafile=ca_cert) + geopy.geocoders.options.default_ssl_context = ctx + geopy.geocoders.options.default_timeout = 2 + + self.__geolocator = Nominatim(user_agent=(user_agent),proxies=proxy) self.__osm_reverse_geo = RateLimiter(self.__geolocator.reverse, min_delay_seconds=1) def is_city(self, name: str, country: str): @@ -27,20 +40,26 @@ def is_city(self, name: str, country: str): def reverse(self, lat: float, lon: float) -> dict | bool: """Tries OSM reverse geocoder (Nomatim) first. If it fails, tries Google Maps reverse geocoder (untested).""" # Try OSM - address = self.__osm_reverse_geo((lat, lon), language='en').raw['address'] - if 'city' in address: - return address - if 'town' in address: - address['city'] = address['town'] - return address - if 'state' in address: - address['city'] = address['state'] - return address + try: + address = self.__osm_reverse_geo((lat, lon), language='en').raw['address'] + if 'city' in address: + return address + if 'town' in address: + address['city'] = address['town'] + return address + if 'state' in address: + address['city'] = address['state'] + return address + except: + pass # Else try google maps if self.__gmaps: - address = self.__gmaps.reverse((lat, lon), language='en') - if 'city' in address: - return address + try: + address = self.__gmaps.reverse((lat, lon), language='en') + if 'city' in address: + return address + except: + pass return False diff --git a/stl/persistence/csv.py b/stl/persistence/csv.py index b549696..b0d5e0b 100644 --- a/stl/persistence/csv.py +++ b/stl/persistence/csv.py @@ -8,8 +8,12 @@ class Csv(PersistenceInterface): def __init__(self, csv_path: str): self.__csv_path = csv_path - def save(self, query: str, listings: list): - with open(self.__csv_path, 'w', encoding='utf-8', newline='') as csvfile: + def save(self, query: str, listings: list,continuous:bool=False): + if continuous==False: + action='w' + else: + 
action='a' + with open(self.__csv_path, action, encoding='utf-8', newline='') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=listings[0].keys()) writer.writeheader() writer.writerows(listings) diff --git a/stl/persistence/elastic.py b/stl/persistence/elastic.py index ff6b391..fbe8498 100644 --- a/stl/persistence/elastic.py +++ b/stl/persistence/elastic.py @@ -128,7 +128,7 @@ def mark_deleted(self, listing_id: str): """Mark a listing as deleted by setting the 'deleted' field to True.""" self.__es.update(index=self.__index, id=listing_id, doc={'deleted': True}) - def save(self, query: str, listings: list): + def save(self, query: str, listings: list,continuous:bool=False): """Bulk save listings by upsert.""" bulk(self.__es, index=self.__index, actions=[{ '_op_type': 'update', diff --git a/stl/scraper/airbnb_scraper.py b/stl/scraper/airbnb_scraper.py index c7d044d..c81065e 100644 --- a/stl/scraper/airbnb_scraper.py +++ b/stl/scraper/airbnb_scraper.py @@ -14,14 +14,24 @@ from stl.persistence.elastic import Elastic from stl.persistence import PersistenceInterface - +def xstr(s): + return '' if s is None else str(s) + +def sign_currency(currency): + if currency=='EUR': + res='€' + elif currency=='USD': + res='$' + else: + res=currency + return res class AirbnbScraperInterface: def run(self, *args, **kwargs): raise NotImplementedError() class AirbnbSearchScraper(AirbnbScraperInterface): - def __init__(self, explore: Explore, pdp: Pdp, reviews: Reviews, persistence: PersistenceInterface, logger: Logger): + def __init__(self, explore: Explore, pdp: Pdp, reviews: Reviews, persistence: PersistenceInterface,currency: str,logger: Logger): self.__logger = logger self.__explore = explore self.__geography = {} @@ -29,6 +39,7 @@ def __init__(self, explore: Explore, pdp: Pdp, reviews: Reviews, persistence: Pe self.__pdp = pdp self.__persistence = persistence self.__reviews = reviews + self.__currency = currency def run(self, query: str, params: dict): listings = [] @@ -42,6 
+53,7 @@ def run(self, query: str, params: dict): page = 1 data_cache = {} while pagination.get('hasNextPage'): + listings_continue =[] self.__logger.info('Searching page {} for {}'.format(page, query)) listing_ids = self.__pdp.collect_listings_from_sections(data, self.__geography, data_cache) for listing_id in listing_ids: # request each property page @@ -52,20 +64,23 @@ def run(self, query: str, params: dict): n_listings += 1 reviews = self.__reviews.get_reviews(listing_id) listing = self.__pdp.get_listing(listing_id, data_cache, self.__geography, reviews) - - msg = '{:>4} {:<12} {:>12} {:<5}{:<9}{} {:<1} {} ({})'.format( - '#' + str(n_listings), - listing['city'], - '${} {}'.format(listing['price_rate'], listing['price_rate_type']), - str(listing['bedrooms']) + 'br' if listing['bedrooms'] else '0br', - '{:.2f}ba'.format(listing['bathrooms']), - listing['room_and_property_type'], - '- {} -'.format(listing['neighborhood']) if listing['neighborhood'] else '', - listing['name'], - listing['url'] - ) - self.__logger.info(msg) - listings.append(listing) + try: + msg = '{:>4} {:<12} {:>12} {:<5}{:<9}{} {:<1} {} ({})'.format( + '#' + str(n_listings), + xstr(listing['city']), + '{}{} {}'.format(sign_currency(self.__currency), xstr(listing['price_rate']), xstr(listing['price_rate_type'])), + xstr(listing['bedrooms']) + 'br' if listing['bedrooms'] else '0br', + '{:.2f}ba'.format(listing['bathrooms'] if listing['bathrooms'] else 0), + xstr(listing['room_and_property_type']), + '- {} -'.format(xstr(listing['neighborhood'])), + xstr(listing['name']), + xstr(listing['url']) + ) + self.__logger.info(msg) + listings.append(listing) + listings_continue.append(listing) + except: + self.__logger.error('ERROR_TO_HANDLE -- '+str(listing['id'])) self.__add_search_params(params, url) items_offset = pagination['itemsOffset'] @@ -73,8 +88,9 @@ def run(self, query: str, params: dict): url = self.__explore.get_url(query, params) data, pagination = self.__explore.search(url) page += 1 + 
self.__persistence.save(query, listings_continue,continuous=True) - self.__persistence.save(query, listings) + #self.__persistence.save(query, listings) self.__logger.info('Got data for {} listings.'.format(n_listings)) @staticmethod