From eca81a3f2d5360de648987218dbfd53cebdb0da5 Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Tue, 7 Mar 2023 01:39:41 +0100 Subject: [PATCH 01/13] add support for proxy --- .env.dist | 2 ++ stl/command/stl_command.py | 24 +++++++++++++++++------- stl/endpoint/base_endpoint.py | 10 ++++++++-- stl/endpoint/pdp.py | 4 ++-- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/.env.dist b/.env.dist index f7d20f5..e39fea9 100644 --- a/.env.dist +++ b/.env.dist @@ -1,6 +1,8 @@ # # Basic settings # +#PROXY="http://localhost:8080" +#IGNORE_CERT=1 # Airbnb client key AIRBNB_API_KEY=d306zoyjsyarp7ifhu67rjxn52tv0t20 diff --git a/stl/command/stl_command.py b/stl/command/stl_command.py index bab6ab8..a92de2f 100644 --- a/stl/command/stl_command.py +++ b/stl/command/stl_command.py @@ -81,14 +81,20 @@ def execute(self): scraper.run(source, self.__args.get('--updated')) elif self.__args.get('data'): - pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, self.__logger) + ignore_cert = os.getenv('IGNORE_CERT', False) + if ignore_cert != False: + ignore_cert = True + pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, self.__logger) print(json.dumps(pdp.get_raw_listing(self.__args.get('')))) elif self.__args.get('pricing'): listing_id = self.__args.get('') checkin = self.__args.get('--checkin') checkout = self.__args.get('--checkout') - pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, self.__logger) + ignore_cert = os.getenv('IGNORE_CERT', False) + if ignore_cert != False: + ignore_cert = True + pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, self.__logger) total = pricing.get_pricing(checkin, checkout, listing_id) print('https://www.airbnb.com/rooms/{} - {} to {}: {}'.format(listing_id, checkin, checkout, total)) @@ -103,14 +109,18 @@ def __create_scraper( ) -> AirbnbScraperInterface: """Create scraper of given type using given parameters.""" api_key = os.getenv('AIRBNB_API_KEY') + proxy = os.getenv('PROXY', None) + ignore_cert = os.getenv('IGNORE_CERT', False) + if ignore_cert != False: + ignore_cert = True if scraper_type == 'search': - explore = Explore(api_key, currency, self.__logger) - pdp = Pdp(api_key, currency, self.__logger) - reviews = Reviews(api_key, currency, self.__logger) + explore = Explore(api_key, currency, proxy, ignore_cert, self.__logger) + pdp = Pdp(api_key, currency, proxy, ignore_cert, self.__logger) + reviews = Reviews(api_key, currency, proxy, ignore_cert, self.__logger) return AirbnbSearchScraper(explore, pdp, reviews, persistence, self.__logger) elif scraper_type == 'calendar': - pricing = Pricing(api_key, currency, self.__logger) - calendar = Calendar(api_key, currency, self.__logger, pricing) + pricing = Pricing(api_key, currency, proxy, ignore_cert, self.__logger) + calendar = Calendar(api_key, currency, proxy, ignore_cert, self.__logger, pricing) return AirbnbCalendarScraper(calendar, persistence, self.__logger) else: raise RuntimeError('Unknown scraper type: %s' % scraper_type) diff --git a/stl/endpoint/base_endpoint.py b/stl/endpoint/base_endpoint.py index 495a375..c272a9f 100644 --- a/stl/endpoint/base_endpoint.py +++ b/stl/endpoint/base_endpoint.py @@ -9,16 +9,22 @@ from stl.exception.api import ApiException, ForbiddenException +import urllib3 +urllib3.disable_warnings() + class BaseEndpoint(ABC): API_PATH = None SOURCE = 'airbnb' - def __init__(self, api_key: str, currency: str, logger: Logger, locale: str = 'en'): + def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, logger: Logger, locale: str = 'en'): self._api_key = api_key self._currency = currency self._locale = locale self._logger = logger + self.proxy = {'http': proxy, + 'https': proxy} + self.verify_cert = not ignore_cert @staticmethod def build_airbnb_url(path: str, query=None): @@ -37,7 +43,7 @@ def _api_request(self, url: str, method: str = 'GET', data=None) -> dict: while attempts < max_attempts: sleep(randint(0, 2)) # do a little throttling attempts += 1 - response = requests.request(method, url, headers=headers, data=data) + response = requests.request(method, url, headers=headers, data=data, proxies=self.proxy, verify=self.verify_cert) response_json = response.json() errors = response_json.get('errors') if not errors: diff --git a/stl/endpoint/pdp.py b/stl/endpoint/pdp.py index 19cf1b3..9cd3ba5 100644 --- a/stl/endpoint/pdp.py +++ b/stl/endpoint/pdp.py @@ -70,8 +70,8 @@ class Pdp(BaseEndpoint): SECTION_NAMES = ['amenities', 'description', 'host_profile', 'location', 'policies'] - def __init__(self, api_key: str, currency: str, logger: Logger): - super().__init__(api_key, currency, logger) + def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, logger: Logger): + super().__init__(api_key, currency, proxy, ignore_cert, logger) self.__geocoder = Geocoder() self.__regex_amenity_id = re.compile(r'^([a-z0-9]+_)+([0-9]+)_') From cda5c4a593395f05de46af80dd7aff98ee87bb85 Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Tue, 7 Mar 2023 02:32:21 +0100 Subject: [PATCH 02/13] - bypass throttle - try catch on msg building, NoneType to be handled - price parsing fix attempt --- stl/command/stl_command.py | 24 +++++++++++++++--------- stl/endpoint/base_endpoint.py | 12 +++++++----- stl/endpoint/pdp.py | 8 ++++++-- stl/scraper/airbnb_scraper.py | 30 ++++++++++++++++-------------- 4 files changed, 44 insertions(+), 30 deletions(-) diff --git a/stl/command/stl_command.py b/stl/command/stl_command.py index a92de2f..bf6e4d8 100644 --- a/stl/command/stl_command.py +++ b/stl/command/stl_command.py @@ -81,10 +81,13 @@ def execute(self): scraper.run(source, self.__args.get('--updated')) elif self.__args.get('data'): - ignore_cert = os.getenv('IGNORE_CERT', False) - if ignore_cert != False: + ignore_cert = os.getenv('IGNORE_CERT',False) + if ignore_cert != False and ignore_cert!=0: ignore_cert = True - pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, self.__logger) + throttle = os.getenv('THROTTLE', True) + if throttle != True and throttle!=1: + throttle = False + pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, throttle, self.__logger) print(json.dumps(pdp.get_raw_listing(self.__args.get('')))) elif self.__args.get('pricing'): @@ -94,7 +97,7 @@ def execute(self): ignore_cert = os.getenv('IGNORE_CERT', False) if ignore_cert != False: ignore_cert = True - pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, self.__logger) + pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, throttle, self.__logger) total = pricing.get_pricing(checkin, checkout, listing_id) print('https://www.airbnb.com/rooms/{} - {} to {}: {}'.format(listing_id, checkin, checkout, total)) @@ -113,14 +116,17 @@ def __create_scraper( ignore_cert = os.getenv('IGNORE_CERT', False) if ignore_cert != False: ignore_cert = True + throttle = os.getenv('THROTTLE', True) + if throttle != True and throttle!=1: + throttle = False if scraper_type == 'search': - explore = Explore(api_key, currency, proxy, ignore_cert, self.__logger) - pdp = Pdp(api_key, currency, proxy, ignore_cert, self.__logger) - reviews = Reviews(api_key, currency, proxy, ignore_cert, self.__logger) + explore = Explore(api_key, currency, proxy, ignore_cert, throttle, self.__logger) + pdp = Pdp(api_key, currency, proxy, ignore_cert, throttle, self.__logger) + reviews = Reviews(api_key, currency, proxy, ignore_cert, throttle, self.__logger) return AirbnbSearchScraper(explore, pdp, reviews, persistence, self.__logger) elif scraper_type == 'calendar': - pricing = Pricing(api_key, currency, proxy, ignore_cert, self.__logger) - calendar = Calendar(api_key, currency, proxy, ignore_cert, self.__logger, pricing) + pricing = Pricing(api_key, currency, proxy, ignore_cert, throttle, self.__logger) + calendar = Calendar(api_key, currency, proxy, ignore_cert, throttle, self.__logger, pricing) return AirbnbCalendarScraper(calendar, persistence, self.__logger) else: raise RuntimeError('Unknown scraper type: %s' % scraper_type) diff --git a/stl/endpoint/base_endpoint.py b/stl/endpoint/base_endpoint.py index c272a9f..f3705ed 100644 --- a/stl/endpoint/base_endpoint.py +++ b/stl/endpoint/base_endpoint.py @@ -17,14 +17,15 @@ class BaseEndpoint(ABC): API_PATH = None SOURCE = 'airbnb' - def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, logger: Logger, locale: str = 'en'): + def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, throttle:bool, logger: Logger, locale: str = 'en'): self._api_key = api_key self._currency = currency self._locale = locale self._logger = logger - self.proxy = {'http': proxy, + self._proxy = {'http': proxy, 'https': proxy} - self.verify_cert = not ignore_cert + self._throttle=throttle + self._verify_cert = not ignore_cert @staticmethod def build_airbnb_url(path: str, query=None): @@ -41,9 +42,10 @@ def _api_request(self, url: str, method: str = 'GET', data=None) -> dict: headers = {'x-airbnb-api-key': self._api_key} max_attempts = 3 while attempts < max_attempts: - sleep(randint(0, 2)) # do a little throttling + if self._throttle: + sleep(randint(0, 2)) # do a little throttling attempts += 1 - response = requests.request(method, url, headers=headers, data=data, proxies=self.proxy, verify=self.verify_cert) + response = requests.request(method, url, headers=headers, data=data, proxies=self._proxy, verify=self._verify_cert) response_json = response.json() errors = response_json.get('errors') if not errors: diff --git a/stl/endpoint/pdp.py b/stl/endpoint/pdp.py index 9cd3ba5..d9514ba 100644 --- a/stl/endpoint/pdp.py +++ b/stl/endpoint/pdp.py @@ -70,8 +70,8 @@ class Pdp(BaseEndpoint): SECTION_NAMES = ['amenities', 'description', 'host_profile', 'location', 'policies'] - def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, logger: Logger): - super().__init__(api_key, currency, proxy, ignore_cert, logger) + def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, throttle: bool, logger: Logger): + super().__init__(api_key, currency, proxy, ignore_cert,throttle, logger) self.__geocoder = Geocoder() self.__regex_amenity_id = re.compile(r'^([a-z0-9]+_)+([0-9]+)_') @@ -391,8 +391,12 @@ def __get_price_key(pricing) -> str: def __get_price_rate(pricing) -> int | None: if pricing: price_key = Pdp.__get_price_key(pricing) + print(pricing['structuredStayDisplayPrice']['primaryLine'][price_key]) return int(pricing['structuredStayDisplayPrice']['primaryLine'][price_key].lstrip('$').replace(',', '')) + #res=pricing['structuredStayDisplayPrice']['primaryLine'][price_key].replace('\xa0',' ') + #print + #return int(re.search(r'\d+',res).group()) return None @staticmethod diff --git a/stl/scraper/airbnb_scraper.py b/stl/scraper/airbnb_scraper.py index c7d044d..a35bbe7 100644 --- a/stl/scraper/airbnb_scraper.py +++ b/stl/scraper/airbnb_scraper.py @@ -52,20 +52,22 @@ def run(self, query: str, params: dict): n_listings += 1 reviews = self.__reviews.get_reviews(listing_id) listing = self.__pdp.get_listing(listing_id, data_cache, self.__geography, reviews) - - msg = '{:>4} {:<12} {:>12} {:<5}{:<9}{} {:<1} {} ({})'.format( - '#' + str(n_listings), - listing['city'], - '${} {}'.format(listing['price_rate'], listing['price_rate_type']), - str(listing['bedrooms']) + 'br' if listing['bedrooms'] else '0br', - '{:.2f}ba'.format(listing['bathrooms']), - listing['room_and_property_type'], - '- {} -'.format(listing['neighborhood']) if listing['neighborhood'] else '', - listing['name'], - listing['url'] - ) - self.__logger.info(msg) - listings.append(listing) + try: + msg = '{:>4} {:<12} {:>12} {:<5}{:<9}{} {:<1} {} ({})'.format( + '#' + str(n_listings), + listing['city'], + '${} {}'.format(listing['price_rate'], listing['price_rate_type']), + str(listing['bedrooms']) + 'br' if listing['bedrooms'] else '0br', + '{:.2f}ba'.format(listing['bathrooms']), + listing['room_and_property_type'], + '- {} -'.format(listing['neighborhood']) if listing['neighborhood'] else '', + listing['name'], + listing['url'] + ) + self.__logger.info(msg) + listings.append(listing) + except: + self.__logger.error('ERROR_TO_HANDLE -- '+listing['url']+' -- '+listing) self.__add_search_params(params, url) items_offset = pagination['itemsOffset'] From ae1eaa12eaa1bd2538b36188584712651d1ea577 Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Tue, 7 Mar 2023 02:53:07 +0100 Subject: [PATCH 03/13] fix parsing prices --- stl/command/stl_command.py | 1 + stl/endpoint/pdp.py | 12 ++++-------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/stl/command/stl_command.py b/stl/command/stl_command.py index bf6e4d8..7b91a13 100644 --- a/stl/command/stl_command.py +++ b/stl/command/stl_command.py @@ -117,6 +117,7 @@ def __create_scraper( if ignore_cert != False: ignore_cert = True throttle = os.getenv('THROTTLE', True) + if throttle != True and throttle!=1: throttle = False if scraper_type == 'search': diff --git a/stl/endpoint/pdp.py b/stl/endpoint/pdp.py index d9514ba..a62b3fb 100644 --- a/stl/endpoint/pdp.py +++ b/stl/endpoint/pdp.py @@ -391,12 +391,8 @@ def __get_price_key(pricing) -> str: def __get_price_rate(pricing) -> int | None: if pricing: price_key = Pdp.__get_price_key(pricing) - print(pricing['structuredStayDisplayPrice']['primaryLine'][price_key]) - return int(pricing['structuredStayDisplayPrice']['primaryLine'][price_key].lstrip('$').replace(',', '')) - - #res=pricing['structuredStayDisplayPrice']['primaryLine'][price_key].replace('\xa0',' ') - #print - #return int(re.search(r'\d+',res).group()) + res=pricing['structuredStayDisplayPrice']['primaryLine'][price_key].replace('\xa0',' ') + return int ( ''.join(filter(str.isdigit, res) ) ) return None @staticmethod @@ -410,11 +406,11 @@ def __get_rate_type(pricing) -> str | None: def __get_total_price(pricing) -> int | None: if pricing['structuredStayDisplayPrice']['secondaryLine']: price = pricing['structuredStayDisplayPrice']['secondaryLine']['price'] - amount_match = re.match(r'\$([\w,]+) total', price) else: price_key = Pdp.__get_price_key(pricing) price = pricing['structuredStayDisplayPrice']['primaryLine'][price_key] - amount_match = re.match(r'\$([\w,]+)', price) + + amount_match = int ( ''.join(filter(str.isdigit, price) ) ) if not amount_match: raise ValueError('No amount match found for price: %s' % price) From 26d33a8b15626d1aeab462b523ab528a70c39985 Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Tue, 7 Mar 2023 02:54:13 +0100 Subject: [PATCH 04/13] try catch msg, str(dict()) --- stl/scraper/airbnb_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/scraper/airbnb_scraper.py b/stl/scraper/airbnb_scraper.py index a35bbe7..9befa43 100644 --- a/stl/scraper/airbnb_scraper.py +++ b/stl/scraper/airbnb_scraper.py @@ -67,7 +67,7 @@ def run(self, query: str, params: dict): self.__logger.info(msg) listings.append(listing) except: - self.__logger.error('ERROR_TO_HANDLE -- '+listing['url']+' -- '+listing) + self.__logger.error('ERROR_TO_HANDLE -- '+listing['url']+' -- '+str(listing)) self.__add_search_params(params, url) items_offset = pagination['itemsOffset'] From e9a6e44314285f977c8da69bea65dadc8d16b5fb Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Tue, 7 Mar 2023 15:17:52 +0100 Subject: [PATCH 05/13] throttle missing in pricing --- .env.dist | 2 ++ stl/command/stl_command.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.env.dist b/.env.dist index e39fea9..7544547 100644 --- a/.env.dist +++ b/.env.dist @@ -4,6 +4,8 @@ #PROXY="http://localhost:8080" #IGNORE_CERT=1 +#THROTTLE=1 + # Airbnb client key AIRBNB_API_KEY=d306zoyjsyarp7ifhu67rjxn52tv0t20 diff --git a/stl/command/stl_command.py b/stl/command/stl_command.py index 7b91a13..c5be4d4 100644 --- a/stl/command/stl_command.py +++ b/stl/command/stl_command.py @@ -97,6 +97,9 @@ def execute(self): ignore_cert = os.getenv('IGNORE_CERT', False) if ignore_cert != False: ignore_cert = True + throttle = os.getenv('THROTTLE', True) + if throttle != True and throttle!=1: + throttle = False pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, throttle, self.__logger) total = pricing.get_pricing(checkin, checkout, listing_id) print('https://www.airbnb.com/rooms/{} - {} to {}: {}'.format(listing_id, checkin, checkout, total)) @@ -117,7 +120,6 @@ def __create_scraper( if ignore_cert != False: ignore_cert = True throttle = os.getenv('THROTTLE', True) - if throttle != True and throttle!=1: throttle = False if scraper_type == 'search': From 046b88c05249aa99e6a2e3e954269f6e0465ec07 Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Tue, 7 Mar 2023 15:23:16 +0100 Subject: [PATCH 06/13] change throttle boolean to int --- .env.dist | 3 ++- stl/command/stl_command.py | 12 +++--------- stl/endpoint/base_endpoint.py | 5 ++--- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/.env.dist b/.env.dist index 7544547..9acf699 100644 --- a/.env.dist +++ b/.env.dist @@ -4,7 +4,8 @@ #PROXY="http://localhost:8080" #IGNORE_CERT=1 -#THROTTLE=1 +#throttle in s +#THROTTLE=2 # Airbnb client key AIRBNB_API_KEY=d306zoyjsyarp7ifhu67rjxn52tv0t20 diff --git a/stl/command/stl_command.py b/stl/command/stl_command.py index c5be4d4..1edda01 100644 --- a/stl/command/stl_command.py +++ b/stl/command/stl_command.py @@ -84,9 +84,7 @@ def execute(self): ignore_cert = os.getenv('IGNORE_CERT',False) if ignore_cert != False and ignore_cert!=0: ignore_cert = True - throttle = os.getenv('THROTTLE', True) - if throttle != True and throttle!=1: - throttle = False + throttle = int(os.getenv('THROTTLE', 2)) pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, throttle, self.__logger) print(json.dumps(pdp.get_raw_listing(self.__args.get('')))) @@ -97,9 +95,7 @@ def execute(self): ignore_cert = os.getenv('IGNORE_CERT', False) if ignore_cert != False: ignore_cert = True - throttle = os.getenv('THROTTLE', True) - if throttle != True and throttle!=1: - throttle = False + throttle = int(os.getenv('THROTTLE', 2)) pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, throttle, self.__logger) total = pricing.get_pricing(checkin, checkout, listing_id) print('https://www.airbnb.com/rooms/{} - {} to {}: {}'.format(listing_id, checkin, checkout, total)) @@ -119,9 +115,7 @@ def __create_scraper( ignore_cert = os.getenv('IGNORE_CERT', False) if ignore_cert != False: ignore_cert = True - throttle = os.getenv('THROTTLE', True) - if throttle != True and throttle!=1: - throttle = False + throttle = int(os.getenv('THROTTLE', 2)) if scraper_type == 'search': explore = Explore(api_key, currency, proxy, ignore_cert, throttle, self.__logger) pdp = Pdp(api_key, currency, proxy, ignore_cert, throttle, self.__logger) diff --git a/stl/endpoint/base_endpoint.py b/stl/endpoint/base_endpoint.py index f3705ed..ee0ea29 100644 --- a/stl/endpoint/base_endpoint.py +++ b/stl/endpoint/base_endpoint.py @@ -17,7 +17,7 @@ class BaseEndpoint(ABC): API_PATH = None SOURCE = 'airbnb' - def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, throttle:bool, logger: Logger, locale: str = 'en'): + def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, throttle:int, logger: Logger, locale: str = 'en'): self._api_key = api_key self._currency = currency self._locale = locale @@ -42,8 +42,7 @@ def _api_request(self, url: str, method: str = 'GET', data=None) -> dict: headers = {'x-airbnb-api-key': self._api_key} max_attempts = 3 while attempts < max_attempts: - if self._throttle: - sleep(randint(0, 2)) # do a little throttling + sleep(randint(0,self._throttle)) # do a little throttling attempts += 1 response = requests.request(method, url, headers=headers, data=data, proxies=self._proxy, verify=self._verify_cert) response_json = response.json() From 2577c55525e94445c3921b94c2b5d6f7cd8b1302 Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Thu, 9 Mar 2023 13:57:13 +0100 Subject: [PATCH 07/13] fix NoneType+ persist on each page --- stl/geo/geocode.py | 30 ++++++++++++++++++------------ stl/persistence/csv.py | 6 ++++++ stl/scraper/airbnb_scraper.py | 21 +++++++++++++-------- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/stl/geo/geocode.py b/stl/geo/geocode.py index 90ac6d0..4c5c62d 100644 --- a/stl/geo/geocode.py +++ b/stl/geo/geocode.py @@ -27,20 +27,26 @@ def is_city(self, name: str, country: str): def reverse(self, lat: float, lon: float) -> dict | bool: """Tries OSM reverse geocoder (Nomatim) first. If it fails, tries Google Maps reverse geocoder (untested).""" # Try OSM - address = self.__osm_reverse_geo((lat, lon), language='en').raw['address'] - if 'city' in address: - return address - if 'town' in address: - address['city'] = address['town'] - return address - if 'state' in address: - address['city'] = address['state'] - return address + try: + address = self.__osm_reverse_geo((lat, lon), language='en').raw['address'] + if 'city' in address: + return address + if 'town' in address: + address['city'] = address['town'] + return address + if 'state' in address: + address['city'] = address['state'] + return address + except: + pass # Else try google maps if self.__gmaps: - address = self.__gmaps.reverse((lat, lon), language='en') - if 'city' in address: - return address + try: + address = self.__gmaps.reverse((lat, lon), language='en') + if 'city' in address: + return address + except: + pass return False diff --git a/stl/persistence/csv.py b/stl/persistence/csv.py index b549696..0548e7e 100644 --- a/stl/persistence/csv.py +++ b/stl/persistence/csv.py @@ -13,3 +13,9 @@ def save(self, query: str, listings: list): writer = csv.DictWriter(csvfile, fieldnames=listings[0].keys()) writer.writeheader() writer.writerows(listings) + + def save_continue(self, query: str, listings: list): + with open(self.__csv_path, 'aw', encoding='utf-8', newline='') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=listings[0].keys()) + writer.writeheader() + writer.writerows(listings) diff --git a/stl/scraper/airbnb_scraper.py b/stl/scraper/airbnb_scraper.py index 9befa43..2bbbc72 100644 --- a/stl/scraper/airbnb_scraper.py +++ b/stl/scraper/airbnb_scraper.py @@ -14,6 +14,8 @@ from stl.persistence.elastic import Elastic from stl.persistence import PersistenceInterface +def xstr(s): + return '' if s is None else str(s) class AirbnbScraperInterface: def run(self, *args, **kwargs): @@ -42,6 +44,7 @@ def run(self, query: str, params: dict): page = 1 data_cache = {} while pagination.get('hasNextPage'): + listings_continue =[] self.__logger.info('Searching page {} for {}'.format(page, query)) listing_ids = self.__pdp.collect_listings_from_sections(data, self.__geography, data_cache) for listing_id in listing_ids: # request each property page @@ -55,17 +58,18 @@ def run(self, query: str, params: dict): try: msg = '{:>4} {:<12} {:>12} {:<5}{:<9}{} {:<1} {} ({})'.format( '#' + str(n_listings), - listing['city'], - '${} {}'.format(listing['price_rate'], listing['price_rate_type']), - str(listing['bedrooms']) + 'br' if listing['bedrooms'] else '0br', - '{:.2f}ba'.format(listing['bathrooms']), - listing['room_and_property_type'], - '- {} -'.format(listing['neighborhood']) if listing['neighborhood'] else '', - listing['name'], - listing['url'] + xstr(listing['city']), + '${} {}'.format(xstr(listing['price_rate']), xstr(listing['price_rate_type'])), + xstr(listing['bedrooms']) + 'br' if listing['bedrooms'] else '0br', + '{:.2f}ba'.format(listing['bathrooms'] if listing['bathrooms'] else 0), + xstr(listing['room_and_property_type']), + '- {} -'.format(xstr(listing['neighborhood'])), + xstr(listing['name']), + xstr(listing['url']) ) self.__logger.info(msg) listings.append(listing) + listings_continue.append(listing) except: self.__logger.error('ERROR_TO_HANDLE -- '+listing['url']+' -- '+str(listing)) @@ -75,6 +79,7 @@ def run(self, query: str, params: dict): url = self.__explore.get_url(query, params) data, pagination = self.__explore.search(url) page += 1 + self.__persistence.save(query, listings_continue) self.__persistence.save(query, listings) self.__logger.info('Got data for {} listings.'.format(n_listings)) From 5519ded43f1a89399ff2451540204d5ec641b33d Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Thu, 9 Mar 2023 14:30:56 +0100 Subject: [PATCH 08/13] fix __get_total_price --- stl/endpoint/pdp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/endpoint/pdp.py b/stl/endpoint/pdp.py index a62b3fb..a639831 100644 --- a/stl/endpoint/pdp.py +++ b/stl/endpoint/pdp.py @@ -415,7 +415,7 @@ def __get_total_price(pricing) -> int | None: if not amount_match: raise ValueError('No amount match found for price: %s' % price) - return int(amount_match[1].replace(',', '')) + return int(amount_match) @staticmethod def __html_to_text(html: str) -> str: From c3e1a64ed790d52a8923e397ae6fbba0b1432f7a Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Fri, 10 Mar 2023 11:56:57 +0100 Subject: [PATCH 09/13] fix to continuous save --- stl/endpoint/base_endpoint.py | 3 +-- stl/endpoint/explore.py | 3 ++- stl/endpoint/pdp.py | 5 +++-- stl/persistence/csv.py | 14 ++++++-------- stl/persistence/elastic.py | 2 +- stl/scraper/airbnb_scraper.py | 6 +++--- 6 files changed, 16 insertions(+), 17 deletions(-) diff --git a/stl/endpoint/base_endpoint.py b/stl/endpoint/base_endpoint.py index ee0ea29..f74ab1b 100644 --- a/stl/endpoint/base_endpoint.py +++ b/stl/endpoint/base_endpoint.py @@ -37,10 +37,9 @@ def build_airbnb_url(path: str, query=None): def _api_request(self, url: str, method: str = 'GET', data=None) -> dict: if data is None: data = {} - attempts = 0 headers = {'x-airbnb-api-key': self._api_key} - max_attempts = 3 + max_attempts = 5 while attempts < max_attempts: sleep(randint(0,self._throttle)) # do a little throttling attempts += 1 diff --git a/stl/endpoint/explore.py b/stl/endpoint/explore.py index b7ace5c..208108a 100644 --- a/stl/endpoint/explore.py +++ b/stl/endpoint/explore.py @@ -16,7 +16,7 @@ def get_url(self, search_string: str, params: dict = None): 'request': { 'metadataOnly': False, 'version': '1.7.9', - 'itemsPerGrid': 20, + 'itemsPerGrid': 200, 'tabId': 'home_tab', 'refinementPaths': ['/homes'], 'source': 'structured_search_input_header', @@ -52,6 +52,7 @@ def get_url(self, search_string: str, params: dict = None): def search(self, url: str): data = self._api_request(url) + print(data['data']['dora']['exploreV3']['metadata']['paginationMetadata']) pagination = data['data']['dora']['exploreV3']['metadata']['paginationMetadata'] return data, pagination diff --git a/stl/endpoint/pdp.py b/stl/endpoint/pdp.py index a639831..1cb5392 100644 --- a/stl/endpoint/pdp.py +++ b/stl/endpoint/pdp.py @@ -359,8 +359,9 @@ def __determine_city_and_neighborhood(self, listing: dict, geography: dict): if reverse_geo_address['city'] in [search_city, city, localized_city] or self.__geocoder.is_city(reverse_geo_address['city'], reverse_geo_address['country']): return reverse_geo_address['city'], localized_neighborhood - if self.__geocoder.is_city((city or localized_city), reverse_geo_address['country']): - return city or localized_city, neighborhood + if reverse_geo_address : + if self.__geocoder.is_city((city or localized_city), reverse_geo_address['country']): + return city or localized_city, neighborhood return city, neighborhood diff --git a/stl/persistence/csv.py b/stl/persistence/csv.py index 0548e7e..b0d5e0b 100644 --- a/stl/persistence/csv.py +++ b/stl/persistence/csv.py @@ -8,14 +8,12 @@ class Csv(PersistenceInterface): def __init__(self, csv_path: str): self.__csv_path = csv_path - def save(self, query: str, listings: list): - with open(self.__csv_path, 'w', encoding='utf-8', newline='') as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=listings[0].keys()) - writer.writeheader() - writer.writerows(listings) - - def save_continue(self, query: str, listings: list): - with open(self.__csv_path, 'aw', encoding='utf-8', newline='') as csvfile: + def save(self, query: str, listings: list,continuous:bool=False): + if continuous==False: + action='w' + else: + action='a' + with open(self.__csv_path, action, encoding='utf-8', newline='') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=listings[0].keys()) writer.writeheader() writer.writerows(listings) diff --git a/stl/persistence/elastic.py b/stl/persistence/elastic.py index ff6b391..fbe8498 100644 --- a/stl/persistence/elastic.py +++ b/stl/persistence/elastic.py @@ -128,7 +128,7 @@ def mark_deleted(self, listing_id: str): """Mark a listing as deleted by setting the 'deleted' field to True.""" self.__es.update(index=self.__index, id=listing_id, doc={'deleted': True}) - def save(self, query: str, listings: list): + def save(self, query: str, listings: list,continuous:bool=False): """Bulk save listings by upsert.""" bulk(self.__es, index=self.__index, actions=[{ '_op_type': 'update', diff --git a/stl/scraper/airbnb_scraper.py b/stl/scraper/airbnb_scraper.py index 2bbbc72..66b28c3 100644 --- a/stl/scraper/airbnb_scraper.py +++ b/stl/scraper/airbnb_scraper.py @@ -71,7 +71,7 @@ def run(self, query: str, params: dict): listings.append(listing) listings_continue.append(listing) except: - self.__logger.error('ERROR_TO_HANDLE -- '+listing['url']+' -- '+str(listing)) + self.__logger.error('ERROR_TO_HANDLE -- '+str(listing['id'])) self.__add_search_params(params, url) items_offset = pagination['itemsOffset'] @@ -79,9 +79,9 @@ def run(self, query: str, params: dict): url = self.__explore.get_url(query, params) data, pagination = self.__explore.search(url) page += 1 - self.__persistence.save(query, listings_continue) + self.__persistence.save(query, listings_continue,continuous=True) - self.__persistence.save(query, listings) + #self.__persistence.save(query, listings) self.__logger.info('Got data for {} listings.'.format(n_listings)) @staticmethod From b55d37c4dfe907f9f68117c5a3e94ad28f1ef202 Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Sat, 11 Mar 2023 20:15:59 +0100 Subject: [PATCH 10/13] fix various bug+ change ignore_crt by custom certificate+ print user currency in message --- .env.dist | 2 +- stl/command/stl_command.py | 18 +++++++++--------- stl/endpoint/base_endpoint.py | 22 +++++++++++++--------- stl/endpoint/explore.py | 4 ++-- stl/endpoint/pdp.py | 19 ++++++++++--------- stl/endpoint/reviews.py | 17 ++++++++++------- stl/geo/geocode.py | 14 ++++++++++++-- stl/scraper/airbnb_scraper.py | 13 +++++++++++-- 8 files changed, 68 insertions(+), 41 deletions(-) diff --git a/.env.dist b/.env.dist index 9acf699..5f76bab 100644 --- a/.env.dist +++ b/.env.dist @@ -2,7 +2,7 @@ # Basic settings # #PROXY="http://localhost:8080" -#IGNORE_CERT=1 +#CA_CERT="/home/user/customproxy_ca.crt" #throttle in s #THROTTLE=2 diff --git a/stl/command/stl_command.py b/stl/command/stl_command.py index 1edda01..dd9d7b8 100644 --- a/stl/command/stl_command.py +++ b/stl/command/stl_command.py @@ -112,18 +112,18 @@ def __create_scraper( """Create scraper of given type using given parameters.""" api_key = os.getenv('AIRBNB_API_KEY') proxy = os.getenv('PROXY', None) - ignore_cert = os.getenv('IGNORE_CERT', False) - if ignore_cert != False: - ignore_cert = True + ca_cert = os.getenv('CA_CERT', True) + #if ignore_cert != False: + # ignore_cert = True throttle = int(os.getenv('THROTTLE', 2)) if scraper_type == 'search': - explore = Explore(api_key, currency, proxy, ignore_cert, throttle, self.__logger) - pdp = Pdp(api_key, currency, proxy, ignore_cert, throttle, self.__logger) - reviews = Reviews(api_key, currency, proxy, ignore_cert, throttle, self.__logger) - return AirbnbSearchScraper(explore, pdp, reviews, persistence, self.__logger) + explore = Explore(api_key, currency, proxy, ca_cert, throttle, self.__logger) + pdp = Pdp(api_key, currency, proxy, ca_cert, throttle, self.__logger) + reviews = Reviews(api_key, currency, proxy, ca_cert, throttle, self.__logger) + return AirbnbSearchScraper(explore, pdp, reviews, persistence,currency, self.__logger) elif scraper_type == 'calendar': - pricing = Pricing(api_key, currency, proxy, ignore_cert, throttle, self.__logger) - calendar = Calendar(api_key, currency, proxy, ignore_cert, throttle, self.__logger, pricing) + pricing = Pricing(api_key, currency, proxy, ca_cert, throttle, self.__logger) + calendar = Calendar(api_key, currency, proxy, ca_cert, throttle, self.__logger, pricing) return AirbnbCalendarScraper(calendar, persistence, self.__logger) else: raise RuntimeError('Unknown scraper type: %s' % scraper_type) diff --git a/stl/endpoint/base_endpoint.py b/stl/endpoint/base_endpoint.py index f74ab1b..17cf132 100644 --- a/stl/endpoint/base_endpoint.py +++ b/stl/endpoint/base_endpoint.py @@ -17,7 +17,7 @@ class BaseEndpoint(ABC): API_PATH = None SOURCE = 'airbnb' - def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, throttle:int, logger: Logger, locale: str = 'en'): + def __init__(self, api_key: str, currency: str, proxy: str, ca_cert: str, throttle:int, logger: Logger, locale: str = 'en'): self._api_key = api_key self._currency = currency self._locale = locale @@ -25,7 +25,7 @@ def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, t self._proxy = {'http': proxy, 'https': proxy} self._throttle=throttle - self._verify_cert = not ignore_cert + self._ca_cert = ca_cert @staticmethod def build_airbnb_url(path: str, query=None): @@ -43,13 +43,17 @@ def _api_request(self, url: str, method: str = 'GET', data=None) -> dict: while attempts < max_attempts: sleep(randint(0,self._throttle)) # do a little throttling attempts += 1 - response = requests.request(method, url, headers=headers, data=data, proxies=self._proxy, verify=self._verify_cert) - response_json = response.json() - errors = response_json.get('errors') - if not errors: - return response_json - - self.__handle_api_error(url, errors) + response = requests.request(method, url, headers=headers, data=data, proxies=self._proxy, verify=self._ca_cert) + if response.text=='Proxy server error': + errors='Proxy server error' + #self.__handle_api_error(url, errors) + else: + response_json = response.json() + errors = response_json.get('errors') + if not errors: + return response_json + else: + self.__handle_api_error(url, errors) raise ApiException(['Could not complete API {} request to "{}"'.format(method, url)]) diff --git a/stl/endpoint/explore.py b/stl/endpoint/explore.py index 208108a..9c21354 100644 --- a/stl/endpoint/explore.py +++ b/stl/endpoint/explore.py @@ -16,7 +16,7 @@ def get_url(self, search_string: str, params: dict = None): 'request': { 'metadataOnly': False, 'version': '1.7.9', - 'itemsPerGrid': 200, + 'itemsPerGrid': 20, 'tabId': 'home_tab', 'refinementPaths': ['/homes'], 'source': 'structured_search_input_header', @@ -52,7 +52,7 @@ def get_url(self, search_string: str, params: dict = None): def search(self, url: str): data = self._api_request(url) - print(data['data']['dora']['exploreV3']['metadata']['paginationMetadata']) + #print(data['data']['dora']['exploreV3']['metadata']['paginationMetadata']) pagination = data['data']['dora']['exploreV3']['metadata']['paginationMetadata'] return data, pagination diff --git a/stl/endpoint/pdp.py b/stl/endpoint/pdp.py index 1cb5392..d528b83 100644 --- a/stl/endpoint/pdp.py +++ b/stl/endpoint/pdp.py @@ -70,9 +70,9 @@ class Pdp(BaseEndpoint): SECTION_NAMES = ['amenities', 'description', 'host_profile', 'location', 'policies'] - def __init__(self, api_key: str, currency: str, proxy: str, ignore_cert: bool, throttle: bool, logger: Logger): - super().__init__(api_key, currency, proxy, ignore_cert,throttle, logger) - self.__geocoder = Geocoder() + def __init__(self, api_key: str, currency: str, proxy: str, ca_cert: str, throttle: bool, logger: Logger): + super().__init__(api_key, currency, proxy, ca_cert,throttle, logger) + self.__geocoder = Geocoder(proxy,ca_cert) self.__regex_amenity_id = re.compile(r'^([a-z0-9]+_)+([0-9]+)_') @staticmethod @@ -410,13 +410,14 @@ def __get_total_price(pricing) -> int | None: else: price_key = Pdp.__get_price_key(pricing) price = pricing['structuredStayDisplayPrice']['primaryLine'][price_key] + + amount_match = ''.join(filter(str.isdigit, price) ) - amount_match = int ( ''.join(filter(str.isdigit, price) ) ) - - if not amount_match: - raise ValueError('No amount match found for price: %s' % price) - - return int(amount_match) + if amount_match =='': + #raise ValueError('No amount match found for price: %s' % price) + return None + else: + return int(amount_match) @staticmethod def __html_to_text(html: str) -> str: diff --git a/stl/endpoint/reviews.py b/stl/endpoint/reviews.py index 0dd4ef1..55377aa 100644 --- a/stl/endpoint/reviews.py +++ b/stl/endpoint/reviews.py @@ -34,13 +34,16 @@ def __get_reviews_batch(self, listing_id: str, limit: int, offset: int): else: n_reviews_total = 0 - reviews = [{ - 'comments': r['comments'], - 'created_at': r['createdAt'], - 'language': r['language'], - 'rating': r['rating'], - 'response': r['response'], - } for r in pdp_reviews['reviews']] + if pdp_reviews!=None: + reviews = [{ + 'comments': r['comments'], + 'created_at': r['createdAt'], + 'language': r['language'], + 'rating': r['rating'], + 'response': r['response'], + } for r in pdp_reviews['reviews']] + else: + reviews=[] return reviews, n_reviews_total diff --git a/stl/geo/geocode.py b/stl/geo/geocode.py index 4c5c62d..cf046f0 100644 --- a/stl/geo/geocode.py +++ b/stl/geo/geocode.py @@ -7,11 +7,21 @@ class Geocoder: - def __init__(self) -> None: + def __init__(self,proxy,ca_cert) -> None: gmaps_api_key = os.environ.get('GMAPS_API_KEY') self.__gmaps = GoogleV3(api_key=gmaps_api_key) if gmaps_api_key else None user_agent = 'stl-scraper-{}'.format(randint(1, 10000)) - self.__geolocator = Nominatim(user_agent=user_agent) + + proxy = {'http': proxy, + 'https': proxy} + + import certifi + import ssl + import geopy.geocoders + + ctx = ssl.create_default_context(cafile=ca_cert) + geopy.geocoders.options.default_ssl_context = ctx + self.__geolocator = Nominatim(user_agent=user_agent,proxies=proxy) self.__osm_reverse_geo = RateLimiter(self.__geolocator.reverse, min_delay_seconds=1) def is_city(self, name: str, country: str): diff --git a/stl/scraper/airbnb_scraper.py b/stl/scraper/airbnb_scraper.py index 66b28c3..c81065e 100644 --- a/stl/scraper/airbnb_scraper.py +++ b/stl/scraper/airbnb_scraper.py @@ -17,13 +17,21 @@ def xstr(s): return '' if s is None else str(s) +def sign_currency(currency): + if currency=='EUR': + res='€' + elif currency=='USD': + res='$' + else: + res=currency + return res class AirbnbScraperInterface: def run(self, *args, **kwargs): raise NotImplementedError() class AirbnbSearchScraper(AirbnbScraperInterface): - def __init__(self, explore: Explore, pdp: Pdp, reviews: Reviews, persistence: PersistenceInterface, logger: Logger): + def __init__(self, explore: Explore, pdp: Pdp, reviews: Reviews, persistence: PersistenceInterface,currency: str,logger: Logger): self.__logger = logger self.__explore = explore self.__geography = {} @@ -31,6 +39,7 @@ def __init__(self, explore: Explore, pdp: Pdp, reviews: Reviews, persistence: Pe self.__pdp = pdp self.__persistence = persistence self.__reviews = reviews + self.__currency = currency def run(self, query: str, params: dict): listings = [] @@ -59,7 +68,7 @@ def run(self, query: str, params: dict): msg = '{:>4} {:<12} {:>12} {:<5}{:<9}{} {:<1} {} ({})'.format( '#' + str(n_listings), xstr(listing['city']), - '${} {}'.format(xstr(listing['price_rate']), xstr(listing['price_rate_type'])), + '{}{} {}'.format(sign_currency(self.__currency), xstr(listing['price_rate']), xstr(listing['price_rate_type'])), xstr(listing['bedrooms']) + 'br' if listing['bedrooms'] else '0br', '{:.2f}ba'.format(listing['bathrooms'] if listing['bathrooms'] else 0), xstr(listing['room_and_property_type']), From 0231ef255adb5203f8eb2700deddc0071227e7bc Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Sat, 11 Mar 2023 20:57:27 +0100 Subject: [PATCH 11/13] user-agent replaced by 'test' --- stl/command/stl_command.py | 5 ++--- stl/geo/geocode.py | 17 ++++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/stl/command/stl_command.py b/stl/command/stl_command.py index dd9d7b8..3fa6a33 100644 --- a/stl/command/stl_command.py +++ b/stl/command/stl_command.py @@ -112,9 +112,8 @@ def __create_scraper( """Create scraper of given type using given parameters.""" api_key = os.getenv('AIRBNB_API_KEY') proxy = os.getenv('PROXY', None) - ca_cert = os.getenv('CA_CERT', True) - #if ignore_cert != False: - # ignore_cert = True + ca_cert = os.getenv('CA_CERT', None) + throttle = int(os.getenv('THROTTLE', 2)) if scraper_type == 'search': explore = Explore(api_key, currency, proxy, ca_cert, throttle, self.__logger) diff --git a/stl/geo/geocode.py b/stl/geo/geocode.py index cf046f0..ea1501e 100644 --- a/stl/geo/geocode.py +++ b/stl/geo/geocode.py @@ -4,6 +4,10 @@ from geopy.extra.rate_limiter import RateLimiter from random import randint +import ssl +import geopy.geocoders + + class Geocoder: @@ -15,13 +19,12 @@ def __init__(self,proxy,ca_cert) -> None: proxy = {'http': proxy, 'https': proxy} - import certifi - import ssl - import geopy.geocoders - - ctx = ssl.create_default_context(cafile=ca_cert) - geopy.geocoders.options.default_ssl_context = ctx - self.__geolocator = Nominatim(user_agent=user_agent,proxies=proxy) + if ca_cert: + ctx = ssl.create_default_context(cafile=ca_cert) + geopy.geocoders.options.default_ssl_context = ctx + geopy.geocoders.options.default_timeout = 2 + + self.__geolocator = Nominatim(user_agent=('test'),proxies=proxy) self.__osm_reverse_geo = RateLimiter(self.__geolocator.reverse, min_delay_seconds=1) def is_city(self, name: str, country: str): From 07be76db85da803263a0d484696ec757e66e6cfc Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Mon, 13 Mar 2023 01:34:02 +0100 Subject: [PATCH 12/13] fix small erros --- stl/command/stl_command.py | 12 ++++-------- stl/endpoint/base_endpoint.py | 7 +++---- stl/geo/geocode.py | 4 ++-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/stl/command/stl_command.py b/stl/command/stl_command.py index 3fa6a33..d926263 100644 --- a/stl/command/stl_command.py +++ b/stl/command/stl_command.py @@ -81,22 +81,18 @@ def execute(self): scraper.run(source, self.__args.get('--updated')) elif self.__args.get('data'): - ignore_cert = os.getenv('IGNORE_CERT',False) - if ignore_cert != False and ignore_cert!=0: - ignore_cert = True + ca_cert = os.getenv('CA_CERT', None) throttle = int(os.getenv('THROTTLE', 2)) - pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, throttle, self.__logger) + pdp = Pdp(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ca_cert, throttle, self.__logger) print(json.dumps(pdp.get_raw_listing(self.__args.get('')))) elif self.__args.get('pricing'): listing_id = self.__args.get('') checkin = self.__args.get('--checkin') checkout = self.__args.get('--checkout') - ignore_cert = os.getenv('IGNORE_CERT', False) - if ignore_cert != False: - ignore_cert = True + ca_cert = os.getenv('CA_CERT', None) throttle = int(os.getenv('THROTTLE', 2)) - pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ignore_cert, throttle, self.__logger) + pricing = Pricing(os.getenv('AIRBNB_API_KEY'), currency, os.getenv('PROXY', None), ca_cert, throttle, self.__logger) total = pricing.get_pricing(checkin, checkout, listing_id) print('https://www.airbnb.com/rooms/{} - {} to {}: {}'.format(listing_id, checkin, checkout, total)) diff --git a/stl/endpoint/base_endpoint.py b/stl/endpoint/base_endpoint.py index 17cf132..2daa167 100644 --- a/stl/endpoint/base_endpoint.py +++ b/stl/endpoint/base_endpoint.py @@ -44,16 +44,15 @@ def _api_request(self, url: str, method: str = 'GET', data=None) -> dict: sleep(randint(0,self._throttle)) # do a little throttling attempts += 1 response = requests.request(method, url, headers=headers, data=data, proxies=self._proxy, verify=self._ca_cert) - if response.text=='Proxy server error': - errors='Proxy server error' - #self.__handle_api_error(url, errors) - else: + try: response_json = response.json() errors = response_json.get('errors') if not errors: return response_json else: self.__handle_api_error(url, errors) + except: + print(f'ERROR ap_request -- {response.text}') raise ApiException(['Could not complete API {} request to "{}"'.format(method, url)]) diff --git a/stl/geo/geocode.py b/stl/geo/geocode.py index ea1501e..f6aa5aa 100644 --- a/stl/geo/geocode.py +++ b/stl/geo/geocode.py @@ -14,7 +14,7 @@ class Geocoder: def __init__(self,proxy,ca_cert) -> None: gmaps_api_key = os.environ.get('GMAPS_API_KEY') self.__gmaps = GoogleV3(api_key=gmaps_api_key) if gmaps_api_key else None - user_agent = 'stl-scraper-{}'.format(randint(1, 10000)) + user_agent = 'stl-scraper-test-{}'.format(randint(1, 100000)) proxy = {'http': proxy, 'https': proxy} @@ -24,7 +24,7 @@ def __init__(self,proxy,ca_cert) -> None: geopy.geocoders.options.default_ssl_context = ctx geopy.geocoders.options.default_timeout = 2 - self.__geolocator = Nominatim(user_agent=('test'),proxies=proxy) + self.__geolocator = Nominatim(user_agent=(user_agent),proxies=proxy) self.__osm_reverse_geo = RateLimiter(self.__geolocator.reverse, min_delay_seconds=1) def is_city(self, name: str, country: str): From 03d8b7d545eaf2d7e41e932388d7b90dfa994ea6 Mon Sep 17 00:00:00 2001 From: jhill-cmd Date: Mon, 13 Mar 2023 02:05:53 +0100 Subject: [PATCH 13/13] bug fixes --- stl/endpoint/calendar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/endpoint/calendar.py b/stl/endpoint/calendar.py index b6ee07f..0ef0603 100644 --- a/stl/endpoint/calendar.py +++ b/stl/endpoint/calendar.py @@ -133,8 +133,8 @@ class Calendar(BaseEndpoint): API_PATH = '/api/v3/PdpAvailabilityCalendar' N_MONTHS = 12 # number of months of data to return; 12 months == 1 year - def __init__(self, api_key: str, currency: str, logger: Logger, pricing: Pricing): - super().__init__(api_key, currency, logger) + def __init__(self, api_key: str, currency: str, proxy: str, ca_cert: str, throttle: int, logger: Logger, pricing: Pricing): + super().__init__(api_key, currency, proxy, ca_cert, throttle, logger) self.__pricing = pricing self.__today = datetime.today()