From b23cf9a4f3f1d76e758d9cc6b4451d1f35b3ee00 Mon Sep 17 00:00:00 2001 From: Matthew Date: Wed, 29 May 2024 18:56:13 +0300 Subject: [PATCH] Standardizing all actions (#26) * Deleted attributes search * Changed tests * New test for RecaptchaSolver and Updated Structure of MockServer * RecaptchaSolverResponse * Black formatting * Deprecation warning and RecaptchaSolverSpider * Formatting * Updated test for RecaptchaSolver * Fixed RecaptchaSolverResponse * Fix: DeprecationWarning * Python version update in GitHub Actions --- .github/workflows/python-test.yml | 4 +- README.md | 2 +- examples/settings.py | 10 +- examples/spiders/auto_recaptcha.py | 47 ++--- examples/spiders/manual_recaptcha.py | 44 +++-- examples/spiders/meduza.py | 12 +- examples/spiders/webscraperio.py | 110 +++++++----- scrapypuppeteer/actions.py | 118 ++++++------ scrapypuppeteer/middleware.py | 259 ++++++++++++++++----------- scrapypuppeteer/request.py | 51 +++--- scrapypuppeteer/response.py | 122 ++++++++----- setup.py | 48 ++--- tests/actions/constants.py | 10 +- tests/actions/test_actions.py | 44 +++-- tests/middleware/test_middleware.py | 15 +- tests/mockserver.py | 179 +++++++++--------- tests/spiders.py | 165 ++++++++++------- 17 files changed, 711 insertions(+), 529 deletions(-) diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 14331cd..fc0d5c8 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -8,10 +8,12 @@ jobs: strategy: matrix: include: - - python-version: "3.7.x" # Min Python version (No 3.6 version) + - python-version: "3.7.x" # Min Python version (3.6 is not available in GitHub Actions) - python-version: "3.8.x" - python-version: "3.9.x" - python-version: "3.10.x" + - python-version: "3.11.x" + - python-version: "3.12.x" - python-version: "3.x" # Last Python version steps: - uses: actions/checkout@v3 diff --git a/README.md b/README.md index ff14805..7a7a350 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ class MySpider(scrapy.Spider): ## Advanced usage `PuppeteerRequest`'s first argument is a browser action. -Avalable actions are defined in `scrapypuppeteer.actions` module as subclasses of `PuppeteerServiceAction`. +Available actions are defined in `scrapypuppeteer.actions` module as subclasses of `PuppeteerServiceAction`. Passing a URL into request is a shortcut for `GoTo(url)` action.
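+For example, these two requests are equivalent (an illustrative sketch; the URL
+and the `parse` callback are placeholders):
+
+```python
+import scrapy
+
+from scrapypuppeteer import PuppeteerRequest
+from scrapypuppeteer.actions import GoTo
+
+
+class ShortcutSpider(scrapy.Spider):
+    name = "shortcut"
+
+    def start_requests(self):
+        # Shortcut form: the URL is wrapped into a GoTo action internally
+        yield PuppeteerRequest("https://example.com", callback=self.parse)
+        # Explicit form: pass the browser action itself
+        yield PuppeteerRequest(GoTo("https://example.com"), callback=self.parse)
+
+    def parse(self, response, **kwargs):
+        self.log(response.url)
+```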
Here is the list of available actions: diff --git a/examples/settings.py b/examples/settings.py index 3e72b65..bdfcff7 100644 --- a/examples/settings.py +++ b/examples/settings.py @@ -1,12 +1,12 @@ -BOT_NAME = 'scrapypuppeteer' +BOT_NAME = "scrapypuppeteer" -SPIDER_MODULES = ['examples.spiders'] -NEWSPIDER_MODULE = 'examples.spiders' +SPIDER_MODULES = ["examples.spiders"] +NEWSPIDER_MODULE = "examples.spiders" CONCURRENT_REQUESTS = 1 DOWNLOADER_MIDDLEWARES = { - 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042 + "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042 } -PUPPETEER_SERVICE_URL = 'http://localhost:3000' +PUPPETEER_SERVICE_URL = "http://localhost:3000" diff --git a/examples/spiders/auto_recaptcha.py b/examples/spiders/auto_recaptcha.py index 3bd4881..7d45478 100644 --- a/examples/spiders/auto_recaptcha.py +++ b/examples/spiders/auto_recaptcha.py @@ -14,37 +14,44 @@ class AutoRecaptchaSpider(scrapy.Spider): start_urls = ["https://www.google.com/recaptcha/api2/demo"] custom_settings = { - 'DOWNLOADER_MIDDLEWARES': { - 'scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware': 1041, - 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042 + "DOWNLOADER_MIDDLEWARES": { + "scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware": 1041, + "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042, + }, + "PUPPETEER_INCLUDE_META": True, + "RECAPTCHA_ACTIVATION": True, + "RECAPTCHA_SOLVING": True, + "RECAPTCHA_SUBMIT_SELECTORS": { + "www.google.com/recaptcha/api2/demo": "#recaptcha-demo-submit", }, - 'PUPPETEER_INCLUDE_META': True, - - 'RECAPTCHA_ACTIVATION': True, - 'RECAPTCHA_SOLVING': True, - 'RECAPTCHA_SUBMIT_SELECTORS': { - 'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit', - } } def start_requests(self): for url in self.start_urls: action = GoTo(url=url) - yield PuppeteerRequest(action=action, callback=self.parse_html, errback=self.error, close_page=False) + yield PuppeteerRequest( + action=action, + callback=self.parse_html, + errback=self.error, + close_page=False, + ) def parse_html(self, response: PuppeteerResponse, **kwargs): - with open(f"recaptcha_page.html", 'wb') as f: + with open(f"recaptcha_page.html", "wb") as f: f.write(response.body) - action = Screenshot(options={ - 'full_page': True, - }) - yield response.follow(action, - callback=self.make_screenshot, - errback=self.error, - close_page=True) + action = Screenshot( + options={ + "full_page": True, + } + ) + yield response.follow( + action, callback=self.make_screenshot, errback=self.error, close_page=True + ) def make_screenshot(self, response: PuppeteerScreenshotResponse, **kwargs): - data = response.screenshot # Note that data is string containing bytes, don't forget to decode them! + data = ( + response.screenshot + ) # Note that data is string containing bytes, don't forget to decode them! 
with open("imageToSave.png", "wb") as fh: fh.write(base64.b64decode(data)) diff --git a/examples/spiders/manual_recaptcha.py b/examples/spiders/manual_recaptcha.py index 797ba2b..ba2e66f 100644 --- a/examples/spiders/manual_recaptcha.py +++ b/examples/spiders/manual_recaptcha.py @@ -16,29 +16,47 @@ class ManualRecaptchaSpider(scrapy.Spider): def start_requests(self): for url in self.start_urls: action = GoTo(url=url) - yield PuppeteerRequest(action=action, callback=self.solve_recaptcha, errback=self.error, close_page=False) + yield PuppeteerRequest( + action=action, + callback=self.solve_recaptcha, + errback=self.error, + close_page=False, + ) def solve_recaptcha(self, response: PuppeteerResponse, **kwargs): action = RecaptchaSolver() - yield response.follow(action=action, callback=self.submit_recaptcha, errback=self.error, close_page=False) + yield response.follow( + action=action, + callback=self.submit_recaptcha, + errback=self.error, + close_page=False, + ) def submit_recaptcha(self, response, **kwargs): - action = Click('#recaptcha-demo-submit') - yield response.follow(action=action, callback=self.parse_html, errback=self.error, close_page=False) + action = Click("#recaptcha-demo-submit") + yield response.follow( + action=action, + callback=self.parse_html, + errback=self.error, + close_page=False, + ) def parse_html(self, response: PuppeteerResponse, **kwargs): - with open(f"recaptcha_page.html", 'wb') as f: + with open(f"recaptcha_page.html", "wb") as f: f.write(response.body) - action = Screenshot(options={ - 'full_page': True, - }) - yield response.follow(action, - callback=self.make_screenshot, - errback=self.error, - close_page=True) + action = Screenshot( + options={ + "full_page": True, + } + ) + yield response.follow( + action, callback=self.make_screenshot, errback=self.error, close_page=True + ) def make_screenshot(self, response: PuppeteerScreenshotResponse, **kwargs): - data = response.screenshot # Note that data is string containing bytes, don't forget to decode them! + data = ( + response.screenshot + ) # Note that data is string containing bytes, don't forget to decode them! 
with open("imageToSave.png", "wb") as fh: fh.write(base64.b64decode(data)) diff --git a/examples/spiders/meduza.py b/examples/spiders/meduza.py index 948d024..6abf5c3 100644 --- a/examples/spiders/meduza.py +++ b/examples/spiders/meduza.py @@ -4,18 +4,18 @@ class MeduzaSpider(scrapy.Spider): - name = 'meduza' + name = "meduza" def start_requests(self): - yield PuppeteerRequest('https://meduza.io', callback=self.parse_main_page) + yield PuppeteerRequest("https://meduza.io", callback=self.parse_main_page) def parse_main_page(self, response: PuppeteerHtmlResponse): - for article_url in response.css('a.Link-isInBlockTitle::attr(href)').getall(): + for article_url in response.css("a.Link-isInBlockTitle::attr(href)").getall(): yield response.follow(article_url, callback=self.parse_article) def parse_article(self, response: PuppeteerHtmlResponse): yield { - 'url': response.url, - 'title': response.css('h1::text').get(), - 'text': '\n'.join(response.css('p.SimpleBlock-p::text').getall()) + "url": response.url, + "title": response.css("h1::text").get(), + "text": "\n".join(response.css("p.SimpleBlock-p::text").getall()), } diff --git a/examples/spiders/webscraperio.py b/examples/spiders/webscraperio.py index 54712ae..661f021 100644 --- a/examples/spiders/webscraperio.py +++ b/examples/spiders/webscraperio.py @@ -8,103 +8,115 @@ class EcommerceSiteSpider(scrapy.Spider): @staticmethod def extract_items(list_page_response): - for item_selector in list_page_response.css('div.row div.thumbnail'): + for item_selector in list_page_response.css("div.row div.thumbnail"): yield { - 'link': item_selector.css('a.title::attr(href)').get(), - 'title': item_selector.css('a.title::attr(title)').get(), - 'price': item_selector.css('h4.price::text').get(), - 'description': item_selector.css('p.description::text').get(), - 'rating': len(item_selector.css('span.glyphicon-star')), - 'reviews_count': int(item_selector - .css('.ratings p.pull-right::text') - .re_first(r'\d+')) + "link": item_selector.css("a.title::attr(href)").get(), + "title": item_selector.css("a.title::attr(title)").get(), + "price": item_selector.css("h4.price::text").get(), + "description": item_selector.css("p.description::text").get(), + "rating": len(item_selector.css("span.glyphicon-star")), + "reviews_count": int( + item_selector.css(".ratings p.pull-right::text").re_first(r"\d+") + ), } @staticmethod def extract_item(detail_page_response): yield { - 'link': detail_page_response.url, - 'title': detail_page_response.css('h4.price + h4::text').get(), - 'price': detail_page_response.css('h4.price::text').get(), - 'description': detail_page_response.css('p.description::text').get(), - 'rating': len(detail_page_response.css('span.glyphicon-star')), - 'reviews_count': int(detail_page_response - .css('.ratings::text') - .re_first('\d+')) + "link": detail_page_response.url, + "title": detail_page_response.css("h4.price + h4::text").get(), + "price": detail_page_response.css("h4.price::text").get(), + "description": detail_page_response.css("p.description::text").get(), + "rating": len(detail_page_response.css("span.glyphicon-star")), + "reviews_count": int( + detail_page_response.css(".ratings::text").re_first("\d+") + ), } class AjaxPaginationSpider(EcommerceSiteSpider): - name = 'e-commerce-ajax' + name = "e-commerce-ajax" def __init__(self, **kwargs): super().__init__(**kwargs) - self.start_url = 'https://webscraper.io/test-sites/e-commerce/ajax/computers/laptops' + self.start_url = ( + 
"https://webscraper.io/test-sites/e-commerce/ajax/computers/laptops" + ) self.next_page_ix = 1 def start_requests(self): - yield PuppeteerRequest(GoTo(self.start_url), - close_page=False, - callback=self.process_list_page) + yield PuppeteerRequest( + GoTo(self.start_url), close_page=False, callback=self.process_list_page + ) def process_list_page(self, response): yield from self.extract_items(response) self.next_page_ix += 1 next_page_selector = f'button[data-id="{self.next_page_ix}"]' if response.css(next_page_selector): - yield response.follow(Click(next_page_selector, - wait_options={'selectorOrTimeout': 3000}), - close_page=False, - callback=self.process_list_page) + yield response.follow( + Click(next_page_selector, wait_options={"selectorOrTimeout": 3000}), + close_page=False, + callback=self.process_list_page, + ) class MoreSpider(EcommerceSiteSpider): - name = 'e-commerce-more' + name = "e-commerce-more" def __init__(self, **kwargs): super().__init__(**kwargs) - self.start_url = 'https://webscraper.io/test-sites/e-commerce/more/computers/laptops' + self.start_url = ( + "https://webscraper.io/test-sites/e-commerce/more/computers/laptops" + ) self.seen_item_links = set() def start_requests(self): - yield PuppeteerRequest(GoTo(self.start_url, wait_options={'selectorOrTimeout': 10000}), - close_page=False, - callback=self.process_list_page) + yield PuppeteerRequest( + GoTo(self.start_url, wait_options={"selectorOrTimeout": 10000}), + close_page=False, + callback=self.process_list_page, + ) def process_list_page(self, response): for item in self.extract_items(response): - if item['link'] not in self.seen_item_links: - self.seen_item_links.add(item['link']) + if item["link"] not in self.seen_item_links: + self.seen_item_links.add(item["link"]) yield item - more_selector = '.ecomerce-items-scroll-more' + more_selector = ".ecomerce-items-scroll-more" more_button = response.css(more_selector) - if 'style' not in more_button.attrib: - yield response.follow(Click(more_selector, - wait_options={'selectorOrTimeout': 1000}), - close_page=False, - callback=self.process_list_page) + if "style" not in more_button.attrib: + yield response.follow( + Click(more_selector, wait_options={"selectorOrTimeout": 1000}), + close_page=False, + callback=self.process_list_page, + ) class ScrollSpider(EcommerceSiteSpider): - name = 'e-commerce-scroll' + name = "e-commerce-scroll" def __init__(self, **kwargs): super().__init__(**kwargs) - self.start_url = 'https://webscraper.io/test-sites/e-commerce/scroll/computers/laptops' + self.start_url = ( + "https://webscraper.io/test-sites/e-commerce/scroll/computers/laptops" + ) self.seen_item_links = set() def start_requests(self): - yield PuppeteerRequest(GoTo(self.start_url), - close_page=False, - callback=self.process_list_page) + yield PuppeteerRequest( + GoTo(self.start_url), close_page=False, callback=self.process_list_page + ) def process_list_page(self, response): items = self.extract_items(response) - new_items = [i for i in items if i['link'] not in self.seen_item_links] + new_items = [i for i in items if i["link"] not in self.seen_item_links] if new_items: for item in new_items: - self.seen_item_links.add(item['link']) + self.seen_item_links.add(item["link"]) yield item - yield response.follow(Scroll(wait_options={'selectorOrTimeout': 1000}), - close_page=False, - callback=self.process_list_page) + yield response.follow( + Scroll(wait_options={"selectorOrTimeout": 1000}), + close_page=False, + callback=self.process_list_page, + ) diff --git 
a/scrapypuppeteer/actions.py b/scrapypuppeteer/actions.py index 16abd54..141a703 100644 --- a/scrapypuppeteer/actions.py +++ b/scrapypuppeteer/actions.py @@ -5,14 +5,12 @@ class PuppeteerServiceAction(ABC): @property @abstractmethod - def endpoint(self): - ... + def endpoint(self): ... - content_type = 'application/json' + content_type = "application/json" @abstractmethod - def payload(self): - ... + def payload(self): ... class GoTo(PuppeteerServiceAction): @@ -57,18 +55,20 @@ class GoTo(PuppeteerServiceAction): """ - endpoint = 'goto' + endpoint = "goto" - def __init__(self, url: str, navigation_options: dict = None, wait_options: dict = None): + def __init__( + self, url: str, navigation_options: dict = None, wait_options: dict = None + ): self.url = url self.navigation_options = navigation_options self.wait_options = wait_options def payload(self): return { - 'url': self.url, - 'navigationOptions': self.navigation_options, - 'waitOptions': self.wait_options + "url": self.url, + "navigationOptions": self.navigation_options, + "waitOptions": self.wait_options, } @@ -81,7 +81,7 @@ class GoForward(PuppeteerServiceAction): """ - endpoint = 'forward' + endpoint = "forward" def __init__(self, navigation_options: dict = None, wait_options: dict = None): self.navigation_options = navigation_options @@ -89,8 +89,8 @@ def __init__(self, navigation_options: dict = None, wait_options: dict = None): def payload(self): return { - 'navigationOptions': self.navigation_options, - 'waitOptions': self.wait_options + "navigationOptions": self.navigation_options, + "waitOptions": self.wait_options, } @@ -103,7 +103,7 @@ class GoBack(PuppeteerServiceAction): """ - endpoint = 'back' + endpoint = "back" def __init__(self, navigation_options: dict = None, wait_options: dict = None): self.navigation_options = navigation_options @@ -111,8 +111,8 @@ def __init__(self, navigation_options: dict = None, wait_options: dict = None): def payload(self): return { - 'navigationOptions': self.navigation_options, - 'waitOptions': self.wait_options + "navigationOptions": self.navigation_options, + "waitOptions": self.wait_options, } @@ -140,12 +140,15 @@ class Click(PuppeteerServiceAction): """ - endpoint = 'click' + endpoint = "click" - def __init__(self, selector: str, - click_options: dict = None, - wait_options: dict = None, - navigation_options: dict = None): + def __init__( + self, + selector: str, + click_options: dict = None, + wait_options: dict = None, + navigation_options: dict = None, + ): self.selector = selector self.click_options = click_options self.wait_options = wait_options @@ -153,10 +156,10 @@ def __init__(self, selector: str, def payload(self): return { - 'selector': self.selector, - 'clickOptions': self.click_options, - 'waitOptions': self.wait_options, - 'navigationOptions': self.navigation_options + "selector": self.selector, + "clickOptions": self.click_options, + "waitOptions": self.wait_options, + "navigationOptions": self.navigation_options, } @@ -172,17 +175,14 @@ class Scroll(PuppeteerServiceAction): """ - endpoint = 'scroll' + endpoint = "scroll" def __init__(self, selector: str = None, wait_options: dict = None): self.selector = selector self.wait_options = wait_options def payload(self): - return { - 'selector': self.selector, - 'waitOptions': self.wait_options - } + return {"selector": self.selector, "waitOptions": self.wait_options} class Screenshot(PuppeteerServiceAction): @@ -213,51 +213,49 @@ class Screenshot(PuppeteerServiceAction): """ - endpoint = 'screenshot' + endpoint = 
"screenshot" def __init__(self, options: dict = None, **kwargs): self.options = options or {} self.options.update(kwargs) def payload(self): - return { - 'options': self.options - } + return {"options": self.options} class RecaptchaSolver(PuppeteerServiceAction): """ - Tries to solve recaptcha on the page. - First it tries to find recaptcha. If it couldn't find a recaptcha nothing - will happen to your 2captcha balance. - Then it solves recaptcha with 2captcha service and inserts the special code - into the page automatically. - Note that it does not click buttons like "submit buttons". - - Params: - solve_recaptcha - bool = True: enables automatic solving of recaptcha on the page if found. - If false is provided recaptcha will still be detected on the page but not solved. - You can get info about found recaptchas via return value. - close_on_empty - bool = False: whether to close page or not if there was no captcha on the page. - - Response for this action is PuppeteerJsonResponse. You can get the return values - via self.data['recaptcha_data']. - You can visit https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object - to get information about return value. + Tries to solve recaptcha on the page. + First it tries to find recaptcha. If it couldn't find a recaptcha nothing + will happen to your 2captcha balance. + Then it solves recaptcha with 2captcha service and inserts the special code + into the page automatically. + Note that it does not click buttons like "submit buttons". + + Params: + solve_recaptcha - bool = True: enables automatic solving of recaptcha on the page if found. + If false is provided recaptcha will still be detected on the page but not solved. + You can get info about found recaptchas via return value. + close_on_empty - bool = False: whether to close page or not if there was no captcha on the page. + + Response for this action is PuppeteerJsonResponse. You can get the return values + via self.data['recaptcha_data']. + You can visit https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object + to get information about return value. 
""" - endpoint = 'recaptcha_solver' - def __init__(self, - solve_recaptcha: bool = True, - close_on_empty: bool = False, - **kwargs): + endpoint = "recaptcha_solver" + + def __init__( + self, solve_recaptcha: bool = True, close_on_empty: bool = False, **kwargs + ): self.solve_recaptcha = solve_recaptcha self.close_on_empty = close_on_empty def payload(self): return { - 'solve_recaptcha': self.solve_recaptcha, - 'close_on_empty': self.close_on_empty + "solve_recaptcha": self.solve_recaptcha, + "close_on_empty": self.close_on_empty, } @@ -277,8 +275,8 @@ class CustomJsAction(PuppeteerServiceAction): """ - endpoint = 'action' - content_type = 'application/javascript' + endpoint = "action" + content_type = "application/javascript" def __init__(self, js_action: str): self.js_action = js_action diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index d6aab22..8e1caaf 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -9,8 +9,23 @@ from scrapy.exceptions import IgnoreRequest, NotConfigured from scrapy.http import Headers, TextResponse -from scrapypuppeteer.actions import Click, GoBack, GoForward, GoTo, RecaptchaSolver, Screenshot, Scroll, CustomJsAction -from scrapypuppeteer.response import PuppeteerResponse, PuppeteerHtmlResponse, PuppeteerScreenshotResponse, PuppeteerJsonResponse +from scrapypuppeteer.actions import ( + Click, + GoBack, + GoForward, + GoTo, + RecaptchaSolver, + Screenshot, + Scroll, + CustomJsAction, +) +from scrapypuppeteer.response import ( + PuppeteerResponse, + PuppeteerHtmlResponse, + PuppeteerScreenshotResponse, + PuppeteerRecaptchaSolverResponse, + PuppeteerJsonResponse, +) from scrapypuppeteer.request import ActionRequest, PuppeteerRequest @@ -42,16 +57,18 @@ class PuppeteerServiceDownloaderMiddleware: Default to False. 
""" - SERVICE_URL_SETTING = 'PUPPETEER_SERVICE_URL' - INCLUDE_HEADERS_SETTING = 'PUPPETEER_INCLUDE_HEADERS' - SERVICE_META_SETTING = 'PUPPETEER_INCLUDE_META' - DEFAULT_INCLUDE_HEADERS = ['Cookie'] # TODO send them separately - - def __init__(self, - crawler: Crawler, - service_url: str, - include_headers: Union[bool, List[str]], - include_meta: bool): + SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" + INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" + SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" + DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately + + def __init__( + self, + crawler: Crawler, + service_url: str, + include_headers: Union[bool, List[str]], + include_meta: bool, + ): self.service_base_url = service_url self.include_headers = include_headers self.include_meta = include_meta @@ -62,7 +79,7 @@ def __init__(self, def from_crawler(cls, crawler): service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) if service_url is None: - raise ValueError('Puppeteer service URL must be provided') + raise ValueError("Puppeteer service URL must be provided") if cls.INCLUDE_HEADERS_SETTING in crawler.settings: try: include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) @@ -72,8 +89,9 @@ def from_crawler(cls, crawler): include_headers = cls.DEFAULT_INCLUDE_HEADERS include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) middleware = cls(crawler, service_url, include_headers, include_meta) - crawler.signals.connect(middleware.close_used_contexts, - signal=signals.spider_closed) + crawler.signals.connect( + middleware.close_used_contexts, signal=signals.spider_closed + ) return middleware def process_request(self, request, spider): @@ -84,24 +102,21 @@ def process_request(self, request, spider): service_url = urljoin(self.service_base_url, action.endpoint) service_params = self._encode_service_params(request) if service_params: - service_url += '?' + service_params + service_url += "?" 
+ service_params meta = { - 'puppeteer_request': request, - 'dont_obey_robotstxt': True, - 'proxy': None + "puppeteer_request": request, + "dont_obey_robotstxt": True, + "proxy": None, } if self.include_meta: - meta = { - **request.meta, - **meta - } + meta = {**request.meta, **meta} return ActionRequest( url=service_url, action=action, - method='POST', - headers=Headers({'Content-Type': action.content_type}), + method="POST", + headers=Headers({"Content-Type": action.content_type}), body=self._serialize_body(action, request), dont_filter=True, cookies=request.cookies, @@ -109,35 +124,41 @@ def process_request(self, request, spider): callback=request.callback, cb_kwargs=request.cb_kwargs, errback=request.errback, - meta=meta + meta=meta, ) @staticmethod def _encode_service_params(request): service_params = {} if request.context_id is not None: - service_params['contextId'] = request.context_id + service_params["contextId"] = request.context_id if request.page_id is not None: - service_params['pageId'] = request.page_id + service_params["pageId"] = request.page_id if request.close_page: - service_params['closePage'] = 1 + service_params["closePage"] = 1 return urlencode(service_params) def _serialize_body(self, action, request): payload = action.payload() - if action.content_type == 'application/json': + if action.content_type == "application/json": if isinstance(payload, dict): # disallow null values in top-level request parameters payload = {k: v for k, v in payload.items() if v is not None} - proxy = request.meta.get('proxy') + proxy = request.meta.get("proxy") if proxy: - payload['proxy'] = proxy - include_headers = self.include_headers if request.include_headers is None else request.include_headers + payload["proxy"] = proxy + include_headers = ( + self.include_headers + if request.include_headers is None + else request.include_headers + ) if include_headers: headers = request.headers.to_unicode_dict() if isinstance(include_headers, list): - headers = {h.lower(): headers[h] for h in include_headers if h in headers} - payload['headers'] = headers + headers = { + h.lower(): headers[h] for h in include_headers if h in headers + } + payload["headers"] = headers return json.dumps(payload) return str(payload) @@ -145,38 +166,36 @@ def process_response(self, request, response, spider): if not isinstance(response, TextResponse): return response - puppeteer_request = request.meta.get('puppeteer_request') + puppeteer_request = request.meta.get("puppeteer_request") if puppeteer_request is None: return response - if b'application/json' not in response.headers.get(b'Content-Type', b''): + if b"application/json" not in response.headers.get(b"Content-Type", b""): return response.replace(request=request) response_data = json.loads(response.text) response_cls = self._get_response_class(puppeteer_request.action) if response.status != 200: - context_id = response_data.get('contextId') + context_id = response_data.get("contextId") if context_id: self.used_contexts[id(spider)].add(context_id) return response - return self._form_response(response_cls, response_data, - puppeteer_request.url, request, puppeteer_request, - spider) - - def _form_response(self, response_cls, response_data, - url, request, puppeteer_request, - spider): - context_id = response_data.pop('contextId', puppeteer_request.context_id) - page_id = response_data.pop('pageId', puppeteer_request.page_id) + return self._form_response( + response_cls, + response_data, + puppeteer_request.url, + request, + puppeteer_request, + spider, + ) - 
attributes = dict() - for attr in response_cls.attributes: - if attr in response_data: - attributes[attr] = response_data.pop(attr) - if response_data: - attributes['data'] = response_data + def _form_response( + self, response_cls, response_data, url, request, puppeteer_request, spider + ): + context_id = response_data.pop("contextId", puppeteer_request.context_id) + page_id = response_data.pop("pageId", puppeteer_request.page_id) self.used_contexts[id(spider)].add(context_id) @@ -186,7 +205,7 @@ def _form_response(self, response_cls, response_data, context_id=context_id, page_id=page_id, request=request, - **attributes + **response_data, ) @staticmethod @@ -195,16 +214,20 @@ def _get_response_class(request_action): return PuppeteerHtmlResponse if isinstance(request_action, Screenshot): return PuppeteerScreenshotResponse + if isinstance(request_action, RecaptchaSolver): + return PuppeteerRecaptchaSolverResponse return PuppeteerJsonResponse def close_used_contexts(self, spider): contexts = list(self.used_contexts[id(spider)]) if contexts: - request = Request(urljoin(self.service_base_url, '/close_context'), - method='POST', - headers=Headers({'Content-Type': 'application/json'}), - meta={"proxy": None}, - body=json.dumps(contexts)) + request = Request( + urljoin(self.service_base_url, "/close_context"), + method="POST", + headers=Headers({"Content-Type": "application/json"}), + meta={"proxy": None}, + body=json.dumps(contexts), + ) return self.crawler.engine.downloader.fetch(request, None) @@ -247,9 +270,7 @@ class PuppeteerRecaptchaDownloaderMiddleware: RECAPTCHA_SOLVING_SETTING = "RECAPTCHA_SOLVING" SUBMIT_SELECTORS_SETTING = "RECAPTCHA_SUBMIT_SELECTORS" - def __init__(self, - recaptcha_solving: bool, - submit_selectors: dict): + def __init__(self, recaptcha_solving: bool, submit_selectors: dict): self.submit_selectors = submit_selectors self.recaptcha_solving = recaptcha_solving self._page_responses = dict() @@ -263,109 +284,135 @@ def from_crawler(cls, crawler: Crawler): recaptcha_solving = crawler.settings.get(cls.RECAPTCHA_SOLVING_SETTING, True) try: - submit_selectors = crawler.settings.getdict(cls.SUBMIT_SELECTORS_SETTING, dict()) + submit_selectors = crawler.settings.getdict( + cls.SUBMIT_SELECTORS_SETTING, dict() + ) except ValueError: - submit_selectors = {'': crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, '')} + submit_selectors = { + "": crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, "") + } except Exception as exception: - raise ValueError(f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}") + raise ValueError( + f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}" + ) for key in submit_selectors.keys(): submit_selector = submit_selectors[key] if isinstance(submit_selector, str): submit_selectors[key] = Click(selector=submit_selector) elif not isinstance(submit_selector, Click): - raise ValueError("Submit selector must be str or Click," - f"but {type(submit_selector)} provided") + raise ValueError( + "Submit selector must be str or Click," + f"but {type(submit_selector)} provided" + ) return cls(recaptcha_solving, submit_selectors) def process_request(self, request, spider): - if request.meta.get('dont_recaptcha', False): + if request.meta.get("dont_recaptcha", False): return None if isinstance(request, PuppeteerRequest): - if request.close_page and not request.meta.get('_captcha_submission', False): + if request.close_page and not request.meta.get( + "_captcha_submission", False + ): request.close_page = False 
request.dont_filter = True self._page_closing.add(request) return request return None - def process_response(self, - request, response, - spider): - if not isinstance(response, PuppeteerResponse): # We only work with PuppeteerResponses + def process_response(self, request, response, spider): + if not isinstance( + response, PuppeteerResponse + ): # We only work with PuppeteerResponses return response puppeteer_request = response.puppeteer_request - if puppeteer_request.meta.get('dont_recaptcha', False): # Skip such responses + if puppeteer_request.meta.get("dont_recaptcha", False): # Skip such responses return response - if puppeteer_request.meta.pop('_captcha_submission', False): # Submitted captcha + if puppeteer_request.meta.pop( + "_captcha_submission", False + ): # Submitted captcha return self.__gen_response(response) - if puppeteer_request.meta.pop('_captcha_solving', False): + if puppeteer_request.meta.pop("_captcha_solving", False): # RECaptchaSolver was called by recaptcha middleware return self._submit_recaptcha(request, response, spider) - if isinstance(puppeteer_request.action, - (Screenshot, Scroll, CustomJsAction, RecaptchaSolver)): - # No recaptcha after this action + if isinstance( + puppeteer_request.action, + (Screenshot, Scroll, CustomJsAction, RecaptchaSolver), + ): + # No recaptcha after these actions return response - # Any puppeteer response besides RecaptchaSolver's PuppeteerResponse + # Any puppeteer response besides PuppeteerRecaptchaSolverResponse return self._solve_recaptcha(request, response) def _solve_recaptcha(self, request, response): - self._page_responses[response.page_id] = response # Saving main response to return it later + self._page_responses[response.page_id] = ( + response # Saving main response to return it later + ) - recaptcha_solver = RecaptchaSolver(solve_recaptcha=self.recaptcha_solving, - close_on_empty=self.__is_closing(response, remove_request=False)) - return response.follow(recaptcha_solver, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - meta={'_captcha_solving': True}, - close_page=False) + recaptcha_solver = RecaptchaSolver( + solve_recaptcha=self.recaptcha_solving, + close_on_empty=self.__is_closing(response, remove_request=False), + ) + return response.follow( + recaptcha_solver, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + meta={"_captcha_solving": True}, + close_page=False, + ) def _submit_recaptcha(self, request, response, spider): - response_data = response.data if not response.puppeteer_request.action.solve_recaptcha: - spider.log(message=f"Found {len(response_data['recaptcha_data']['captchas'])} captcha " - f"but did not solve due to argument", - level=logging.INFO) + spider.log( + message=f"Found {len(response.recaptcha_data['captchas'])} captcha " + f"but did not solve due to argument", + level=logging.INFO, + ) return self.__gen_response(response) # Click "submit button"? 
- if response_data['recaptcha_data']['captchas'] and self.submit_selectors: + if response.recaptcha_data["captchas"] and self.submit_selectors: # We need to click "submit button" for domain, submitting in self.submit_selectors.items(): if domain in response.url: if not submitting.selector: return self.__gen_response(response) - return response.follow(action=submitting, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - close_page=self.__is_closing(response), - meta={'_captcha_submission': True}) - raise IgnoreRequest("No submit selector found to click on the page but captcha found") + return response.follow( + action=submitting, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + close_page=self.__is_closing(response), + meta={"_captcha_submission": True}, + ) + raise IgnoreRequest( + "No submit selector found to click on the page but captcha found" + ) return self.__gen_response(response) def __gen_response(self, response): main_response_data = dict() - main_response_data['page_id'] = None if self.__is_closing(response) else response.puppeteer_request.page_id + main_response_data["page_id"] = ( + None if self.__is_closing(response) else response.puppeteer_request.page_id + ) main_response = self._page_responses.pop(response.page_id) if isinstance(main_response, PuppeteerHtmlResponse): if isinstance(response.puppeteer_request.action, RecaptchaSolver): - main_response_data['body'] = response.data['html'] + main_response_data["body"] = response.html elif isinstance(response.puppeteer_request.action, Click): - main_response_data['body'] = response.body + main_response_data["body"] = response.body return main_response.replace(**main_response_data) - def __is_closing(self, response, - remove_request: bool = True) -> bool: + def __is_closing(self, response, remove_request: bool = True) -> bool: main_request = self._page_responses[response.page_id].puppeteer_request close_page = main_request in self._page_closing if close_page and remove_request: diff --git a/scrapypuppeteer/request.py b/scrapypuppeteer/request.py index 3c2bc8d..4ffc477 100644 --- a/scrapypuppeteer/request.py +++ b/scrapypuppeteer/request.py @@ -11,19 +11,14 @@ class ActionRequest(Request): beautified representation. """ - attributes: Tuple[str, ...] = Request.attributes + ( - 'action', - ) + attributes: Tuple[str, ...] = Request.attributes + ("action",) """ A tuple of :class:`str` objects containing the name of all public attributes of the class that are also keyword parameters of the ``__init__`` method. """ - def __init__(self, - url: str, - action: Union[str, PuppeteerServiceAction], - **kwargs): + def __init__(self, url: str, action: Union[str, PuppeteerServiceAction], **kwargs): self.action = action super().__init__(url, **kwargs) @@ -36,14 +31,14 @@ def __str__(self): class PuppeteerRequest(ActionRequest): """ - Request to be executed in browser with puppeteer. + Request to be executed in browser with puppeteer. """ attributes: Tuple[str, ...] 
= ActionRequest.attributes + ( - 'context_id', - 'page_id', - 'close_page', - 'include_headers' + "context_id", + "page_id", + "close_page", + "include_headers", ) """ A tuple of :class:`str` objects containing the name of all public @@ -53,13 +48,15 @@ class PuppeteerRequest(ActionRequest): Currently used by :meth:`PuppeteerRequest.replace` """ - def __init__(self, - action: Union[str, PuppeteerServiceAction], - context_id: str = None, - page_id: str = None, - close_page: bool = True, - include_headers: Union[bool, List[str]] = None, - **kwargs): + def __init__( + self, + action: Union[str, PuppeteerServiceAction], + context_id: str = None, + page_id: str = None, + close_page: bool = True, + include_headers: Union[bool, List[str]] = None, + **kwargs, + ): """ :param action: URL or browser action @@ -76,18 +73,22 @@ def __init__(self, or None (default, let middleware decide) :param kwargs: """ - url = kwargs.pop('url', None) + url = kwargs.pop("url", None) if isinstance(action, str): url = action - navigation_options = kwargs.pop('navigation_options', None) - wait_options = kwargs.pop('wait_options', None) - action = GoTo(url, navigation_options=navigation_options, wait_options=wait_options) + navigation_options = kwargs.pop("navigation_options", None) + wait_options = kwargs.pop("wait_options", None) + action = GoTo( + url, navigation_options=navigation_options, wait_options=wait_options + ) elif isinstance(action, GoTo): url = action.url elif not isinstance(action, PuppeteerServiceAction): - raise ValueError('Undefined browser action') + raise ValueError("Undefined browser action") if url is None: - raise ValueError('Request is not a goto-request and does not follow a response') + raise ValueError( + "Request is not a goto-request and does not follow a response" + ) super().__init__(url, action, **kwargs) self.context_id = context_id self.page_id = page_id diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index 2011440..efdef1e 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -1,18 +1,20 @@ from typing import Tuple, Union +from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http import TextResponse from scrapypuppeteer import PuppeteerRequest from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction +import warnings -class PuppeteerResponse(TextResponse): +class PuppeteerResponse(TextResponse): attributes: Tuple[str, ...] = TextResponse.attributes + ( - 'url', - 'puppeteer_request', - 'context_id', - 'page_id' + "url", + "puppeteer_request", + "context_id", + "page_id", ) """ A tuple of :class:`str` objects containing the name of all public @@ -22,22 +24,26 @@ class PuppeteerResponse(TextResponse): Currently used by :meth:`PuppeteerResponse.replace`. """ - def __init__(self, - url: str, - puppeteer_request: PuppeteerRequest, - context_id: str, - page_id: str, - **kwargs): + def __init__( + self, + url: str, + puppeteer_request: PuppeteerRequest, + context_id: str, + page_id: str, + **kwargs + ): self.puppeteer_request = puppeteer_request self.context_id = context_id self.page_id = page_id super().__init__(url, **kwargs) - def follow(self, - action: Union[str, PuppeteerServiceAction], - close_page=True, - accumulate_meta: bool = False, - **kwargs) -> PuppeteerRequest: + def follow( + self, + action: Union[str, PuppeteerServiceAction], + close_page=True, + accumulate_meta: bool = False, + **kwargs + ) -> PuppeteerRequest: """ Execute action on the same browser page. 
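+
+        Example (sketch): keep the page open and chain another action on the
+        same page; the selector is a placeholder:
+
+            def parse_list(self, response, **kwargs):
+                yield response.follow(
+                    Click("a.next-page"),
+                    callback=self.parse_list,
+                    close_page=False,
+                )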
@@ -53,16 +59,17 @@ def follow(self, elif isinstance(action, GoTo): action.url = self.urljoin(action.url) else: - kwargs['url'] = self.url - kwargs['dont_filter'] = True + kwargs["url"] = self.url + kwargs["dont_filter"] = True if accumulate_meta: - kwargs['meta'] = { - **self.meta, - **kwargs.pop('meta', {}) - } + kwargs["meta"] = {**self.meta, **kwargs.pop("meta", {})} - return PuppeteerRequest(action, - context_id=self.context_id, page_id=page_id, - close_page=close_page, **kwargs) + return PuppeteerRequest( + action, + context_id=self.context_id, + page_id=page_id, + close_page=close_page, + **kwargs + ) class PuppeteerHtmlResponse(PuppeteerResponse): @@ -71,10 +78,7 @@ class PuppeteerHtmlResponse(PuppeteerResponse): Additionally, exposes received html and cookies via corresponding attributes. """ - attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ( - 'html', - 'cookies' - ) + attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ("html", "cookies") """ A tuple of :class:`str` objects containing the name of all public attributes of the class that are also keyword parameters of the @@ -84,11 +88,11 @@ class PuppeteerHtmlResponse(PuppeteerResponse): """ def __init__(self, url, puppeteer_request, context_id, page_id, **kwargs): - self.html = kwargs.pop('html') - self.cookies = kwargs.pop('cookies') - kwargs.setdefault('body', self.html) - kwargs.setdefault('encoding', 'utf-8') - kwargs.setdefault('headers', {}).setdefault('Content-Type', 'text/html') + self.html = kwargs.pop("html") + self.cookies = kwargs.pop("cookies") + kwargs.setdefault("body", self.html) + kwargs.setdefault("encoding", "utf-8") + kwargs.setdefault("headers", {}).setdefault("Content-Type", "text/html") super().__init__(url, puppeteer_request, context_id, page_id, **kwargs) @@ -98,26 +102,58 @@ class PuppeteerScreenshotResponse(PuppeteerResponse): Screenshot is available via self.screenshot as base64 encoded string. """ - attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ( - 'screenshot', - ) + attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ("screenshot",) def __init__(self, url, puppeteer_request, context_id, page_id, **kwargs): - self.screenshot = kwargs.pop('screenshot') + self.screenshot = kwargs.pop("screenshot") super().__init__(url, puppeteer_request, context_id, page_id, **kwargs) class PuppeteerJsonResponse(PuppeteerResponse): """ - Response for CustomJsAction and RecaptchaSolver. + Response for CustomJsAction. Result is available via self.data object. """ - attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ( - 'data', - ) + attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ("data",) def __init__(self, url, puppeteer_request, context_id, page_id, data, **kwargs): - kwargs['headers'] = {'Content-Type': 'application/json'} + kwargs["headers"] = {"Content-Type": "application/json"} self.data = data super().__init__(url, puppeteer_request, context_id, page_id, **kwargs) + + +class PuppeteerRecaptchaSolverResponse(PuppeteerJsonResponse, PuppeteerHtmlResponse): + """ + Response for RecaptchaSolver. + Result is available via self.recaptcha_data; self.data["recaptcha_data"] is + deprecated and scheduled for removal in a future version. + """ + + attributes: Tuple[str, ...] = tuple( + set(PuppeteerHtmlResponse.attributes + PuppeteerJsonResponse.attributes) + ) + ("recaptcha_data",) + + @property + def data(self): + warnings.warn( + "self.data['recaptcha_data'] is deprecated and scheduled for removal in a future version. 
" + "Use self.recaptcha_data instead.", + ScrapyDeprecationWarning, + stacklevel=2, + ) + return self._data + + @data.setter + def data(self, value): + self._data = value + + def __init__( + self, url, puppeteer_request, context_id, page_id, recaptcha_data, **kwargs + ): + kwargs["headers"] = {"Content-Type": "application/json"} + self._data = {"recaptcha_data": recaptcha_data} + self.recaptcha_data = recaptcha_data + super().__init__( + url, puppeteer_request, context_id, page_id, self._data, **kwargs + ) diff --git a/setup.py b/setup.py index 576b669..bc0ce3d 100644 --- a/setup.py +++ b/setup.py @@ -6,31 +6,31 @@ long_description = readme.read() setup( - name='scrapy-puppeteer-client', - version='0.1.5', - description='A library to use Puppeteer-managed browser in Scrapy spiders', + name="scrapy-puppeteer-client", + version="0.1.5", + description="A library to use Puppeteer-managed browser in Scrapy spiders", long_description=long_description, long_description_content_type="text/markdown", - url='https://github.com/ispras/scrapy-puppeteer', - author='MODIS @ ISP RAS', - maintainer='Maksim Varlamov', - maintainer_email='varlamov@ispras.ru', - packages=['scrapypuppeteer'], - install_requires=['scrapy>=2.6'], - python_requires='>=3.6', - license='BSD', + url="https://github.com/ispras/scrapy-puppeteer", + author="MODIS @ ISP RAS", + maintainer="Maksim Varlamov", + maintainer_email="varlamov@ispras.ru", + packages=["scrapypuppeteer"], + install_requires=["scrapy>=2.6"], + python_requires=">=3.6", + license="BSD", classifiers=[ - 'Development Status :: 3 - Alpha', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Framework :: Scrapy', - 'Intended Audience :: Developers', - 'Operating System :: OS Independent', - 'License :: OSI Approved :: BSD License' - ] + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Framework :: Scrapy", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "License :: OSI Approved :: BSD License", + ], ) diff --git a/tests/actions/constants.py b/tests/actions/constants.py index c30cbcc..36ce7f8 100644 --- a/tests/actions/constants.py +++ b/tests/actions/constants.py @@ -13,10 +13,12 @@ def __gen_nav_opts(): for opt_num in range(1, 5): for comb in combinations(WAIT_UNTIL, opt_num): timeout = randint(0, 100) * 1000 - options.append({ - 'timeout': timeout, - 'waitUntil': list(comb), - }) + options.append( + { + "timeout": timeout, + "waitUntil": list(comb), + } + ) return options diff --git a/tests/actions/test_actions.py b/tests/actions/test_actions.py index f3f6e4b..e3a36b4 100644 --- a/tests/actions/test_actions.py +++ b/tests/actions/test_actions.py @@ -7,9 +7,9 @@ def _gen_goto(): for url, nav_opt, wait_opt in product(URLS, NAV_OPTS, WAIT_OPTS): expected = { - 'url': url, - 'navigationOptions': nav_opt, - 'waitOptions': wait_opt, + "url": url, + "navigationOptions": nav_opt, + "waitOptions": wait_opt, } yield url, nav_opt, wait_opt, expected @@ -17,62 +17,58 @@ def _gen_goto(): def _gen_back_forward(): for 
nav_opt, wait_opt in product(NAV_OPTS, WAIT_OPTS): expected = { - 'navigationOptions': nav_opt, - 'waitOptions': wait_opt, + "navigationOptions": nav_opt, + "waitOptions": wait_opt, } yield nav_opt, wait_opt, expected def _gen_click(): - for selector, click_opt, nav_opt, wait_opt in product(SELECTORS, CLICK_OPTS, NAV_OPTS, WAIT_OPTS): + for selector, click_opt, nav_opt, wait_opt in product( + SELECTORS, CLICK_OPTS, NAV_OPTS, WAIT_OPTS + ): expected = { - 'selector': selector, - 'clickOptions': click_opt, - 'waitOptions': wait_opt, - 'navigationOptions': nav_opt, + "selector": selector, + "clickOptions": click_opt, + "waitOptions": wait_opt, + "navigationOptions": nav_opt, } yield selector, click_opt, nav_opt, wait_opt, expected def _gen_scroll(): for selector, wait_opt in product(SELECTORS, WAIT_OPTS): - expected = { - 'selector': selector, - 'waitOptions': wait_opt - } + expected = {"selector": selector, "waitOptions": wait_opt} yield selector, wait_opt, expected -@mark.parametrize("url, navigation_options, wait_options, expected", - _gen_goto()) +@mark.parametrize("url, navigation_options, wait_options, expected", _gen_goto()) def test_goto(url, navigation_options, wait_options, expected): action = GoTo(url, navigation_options, wait_options) assert action.payload() == expected -@mark.parametrize("navigation_options, wait_options, expected", - _gen_back_forward()) +@mark.parametrize("navigation_options, wait_options, expected", _gen_back_forward()) def test_go_forward(navigation_options, wait_options, expected): action = GoForward(navigation_options, wait_options) assert action.payload() == expected -@mark.parametrize("navigation_options, wait_options, expected", - _gen_back_forward()) +@mark.parametrize("navigation_options, wait_options, expected", _gen_back_forward()) def test_go_back(navigation_options, wait_options, expected): action = GoBack(navigation_options, wait_options) assert action.payload() == expected -@mark.parametrize("selector, click_options, navigation_options, wait_options, expected", - _gen_click()) +@mark.parametrize( + "selector, click_options, navigation_options, wait_options, expected", _gen_click() +) def test_click(selector, click_options, navigation_options, wait_options, expected): action = Click(selector, click_options, wait_options, navigation_options) assert action.payload() == expected -@mark.parametrize("selector, wait_options, expected", - _gen_scroll()) +@mark.parametrize("selector, wait_options, expected", _gen_scroll()) def test_scroll(selector, wait_options, expected): action = Scroll(selector, wait_options) assert action.payload() == expected diff --git a/tests/middleware/test_middleware.py b/tests/middleware/test_middleware.py index 13767c9..fa37c00 100644 --- a/tests/middleware/test_middleware.py +++ b/tests/middleware/test_middleware.py @@ -4,6 +4,7 @@ ClickSpider, ScreenshotSpider, CustomJsActionSpider, + RecaptchaSolverSpider, ) from tests.mockserver import MockServer from twisted.trial.unittest import TestCase @@ -13,16 +14,16 @@ class PuppeteerCrawlTest(TestCase): SETTINGS = { - 'DOWNLOADER_MIDDLEWARES': { - 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042 + "DOWNLOADER_MIDDLEWARES": { + "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042 }, - 'PUPPETEER_SERVICE_URL': None, + "PUPPETEER_SERVICE_URL": None, } def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() - self.SETTINGS['PUPPETEER_SERVICE_URL'] = self.mockserver.http_address + self.SETTINGS["PUPPETEER_SERVICE_URL"] = 
self.mockserver.http_address def tearDown(self): self.mockserver.__exit__(None, None, None) @@ -30,7 +31,7 @@ def tearDown(self): def _start_testing(self, spider_cls, expected): crawler = get_crawler(spider_cls, self.SETTINGS) yield crawler.crawl(mockserver=self.mockserver) - self.assertEqual(len(crawler.spider.urls_visited), expected) + self.assertEqual(expected, len(crawler.spider.urls_visited)) @defer.inlineCallbacks def test_goto(self): @@ -51,3 +52,7 @@ def test_screenshot(self): @defer.inlineCallbacks def test_custom_js_action(self): yield from self._start_testing(CustomJsActionSpider, 1) + + @defer.inlineCallbacks + def test_recaptcha_solver(self): + yield from self._start_testing(RecaptchaSolverSpider, 1) diff --git a/tests/mockserver.py b/tests/mockserver.py index 9d3bb60..99f83c0 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -6,6 +6,8 @@ from subprocess import PIPE, Popen from typing import Dict from secrets import token_hex +from json import dumps +from base64 import b64encode from twisted.internet import reactor from twisted.internet.protocol import ServerFactory @@ -16,11 +18,11 @@ from scrapy.utils.python import to_bytes -def getarg(request, name, default=None, type=None): +def get_arg(request, name, default=None, arg_type=None): if name in request.args: value = request.args[name][0] - if type is not None: - value = type(value) + if arg_type is not None: + value = arg_type(value) return value return default @@ -29,9 +31,9 @@ def get_mockserver_env() -> Dict[str, str]: """Return an OS environment dict suitable to run mockserver processes.""" tests_path = Path(__file__).parent.parent - pythonpath = str(tests_path) + os.pathsep + os.environ.get("PYTHONPATH", "") + python_path = str(tests_path) + os.pathsep + os.environ.get("PYTHONPATH", "") env = os.environ.copy() - env["PYTHONPATH"] = pythonpath + env["PYTHONPATH"] = python_path return env @@ -39,117 +41,128 @@ class LeafResource(resource.Resource): isLeaf = True def render_POST(self, request): - page_id = getarg(request, b"pageId", default=None, type=str) - context_id = getarg(request, b"contextId", default=None, type=str) - close_page = getarg(request, b"closePage", default=0, type=bool) + page_id = get_arg(request, b"pageId", default=None, arg_type=str) + context_id = get_arg(request, b"contextId", default=None, arg_type=str) + close_page = get_arg(request, b"closePage", default=0, arg_type=bool) request.setHeader(b"Content-Type", b"application/json") - self.deferRequest(request, 0, self._render_request, request, page_id, context_id, close_page) + self.defer_request( + request, 0, self.render_request, request, page_id, context_id, close_page + ) return NOT_DONE_YET - def deferRequest(self, request, delay, f, *a, **kw): - def _cancelrequest(_): + @staticmethod + def defer_request(request, delay, render_func, *args, **kwargs): + def _cancel_request(_): # silence CancelledError d.addErrback(lambda _: None) d.cancel() - d = deferLater(reactor, delay, f, *a, **kw) - request.notifyFinish().addErrback(_cancelrequest) + d = deferLater(reactor, delay, render_func, *args, **kwargs) + request.notifyFinish().addErrback(_cancel_request) return d - def _render_request(self, request, page_id, context_id, close_page): + def render_request(self, request, page_id, context_id, close_page): + request.write( + to_bytes(dumps(self._form_response(page_id, context_id, close_page))) + ) + request.finish() + + def _form_response(self, page_id, context_id, close_page): raise NotImplementedError class GoTo(LeafResource): - def 
_render_request(self, request, page_id, context_id, close_page): - html = ''' + def _form_response(self, page_id, context_id, close_page): + html = """ - ''' - from json import dumps - response_data = { - 'contextId': token_hex(20), - 'pageId': token_hex(20), - 'html': html, - 'cookies': None + """ + return { + "contextId": token_hex(20), + "pageId": token_hex(20), + "html": html, + "cookies": None, } - request.write(to_bytes(dumps(response_data))) - request.finish() class GoForward(LeafResource): - def _render_request(self, request, page_id, context_id, close_page): - html = ''' + def _form_response(self, page_id, context_id, close_page): + html = """ went forward - ''' - from json import dumps - response_data = { - 'contextId': context_id, - 'pageId': page_id, - 'html': html, - 'cookies': None + """ + return { + "contextId": context_id, + "pageId": page_id, + "html": html, + "cookies": None, } - request.write(to_bytes(dumps(response_data))) - request.finish() class Back(LeafResource): - def _render_request(self, request, page_id, context_id, close_page): - html = ''' + def _form_response(self, page_id, context_id, close_page): + html = """ went back - ''' - from json import dumps - response_data = { - 'contextId': context_id, - 'pageId': page_id, - 'html': html, - 'cookies': None + """ + return { + "contextId": context_id, + "pageId": page_id, + "html": html, + "cookies": None, } - request.write(to_bytes(dumps(response_data))) - request.finish() class Click(LeafResource): - def _render_request(self, request, page_id, context_id, close_page): - html = ''' + def _form_response(self, page_id, context_id, close_page): + html = """ clicked - ''' - from json import dumps - response_data = { - 'contextId': context_id, - 'pageId': page_id, - 'html': html, - 'cookies': None + """ + return { + "contextId": context_id, + "pageId": page_id, + "html": html, + "cookies": None, } - request.write(to_bytes(dumps(response_data))) - request.finish() class Screenshot(LeafResource): - def _render_request(self, request, page_id, context_id, close_page): - from base64 import b64encode - from json import dumps - with open("./tests/scrapy_logo.png", 'rb') as image: - response_data = { - 'screenshot': b64encode(image.read()).decode(), + def _form_response(self, page_id, context_id, close_page): + with open("./tests/scrapy_logo.png", "rb") as image: + return { + "contextId": context_id, + "pageId": page_id, + "screenshot": b64encode(image.read()).decode(), } - request.write(to_bytes(dumps(response_data))) - request.finish() -class Action(LeafResource): - def _render_request(self, request, page_id, context_id, close_page): - from json import dumps - response_data = { - 'field': "Hello!", +class RecaptchaSolver(LeafResource): + def _form_response(self, page_id, context_id, close_page): + html = """ + there is recaptcha on the page! 
+ """ + return { + "contextId": context_id, + "pageId": page_id, + "html": html, + "cookies": None, + "recaptcha_data": { + "captchas": [1], # 1 captcha + "some_other_fields": [], + }, + } + + +class CustomJsAction(LeafResource): + def _form_response(self, page_id, context_id, close_page): + return { + "contextId": context_id, + "pageId": page_id, + "data": {"field": "Hello!"}, } - request.write(to_bytes(dumps(response_data))) - request.finish() class CloseContext(LeafResource): - def _render_request(self, request, page_id, context_id, close_page): + def render_request(self, request, page_id, context_id, close_page): request.finish() @@ -161,7 +174,8 @@ def __init__(self): self.putChild(b"back", Back()) self.putChild(b"click", Click()) self.putChild(b"screenshot", Screenshot()) - self.putChild(b"action", Action()) + self.putChild(b"action", CustomJsAction()) + self.putChild(b"recaptcha_solver", RecaptchaSolver()) self.putChild(b"close_context", CloseContext()) def getChild(self, name, request): @@ -188,23 +202,24 @@ def url(self, path): return host + path -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser() - parser.add_argument( - "-t", "--type", type=str, choices=("http",), default="http" - ) + parser.add_argument("-t", "--type", type=str, choices=("http",), default="http") args = parser.parse_args() if args.type == "http": root = Root() factory: ServerFactory = Site(root) - httpPort = reactor.listenTCP(0, factory) - + http_port = reactor.listenTCP(0, factory) def print_listening(): - http_host = httpPort.getHost() + http_host = http_port.getHost() http_address = f"http://{http_host.host}:{http_host.port}" print(http_address) reactor.callWhenRunning(print_listening) reactor.run() + + +if __name__ == "__main__": + main() diff --git a/tests/spiders.py b/tests/spiders.py index 0b4b8a5..dddcbc9 100644 --- a/tests/spiders.py +++ b/tests/spiders.py @@ -7,6 +7,7 @@ Click, Screenshot, CustomJsAction, + RecaptchaSolver, ) @@ -26,6 +27,10 @@ def __init__(self, *args, **kwargs): def closed(self, reason): self.meta["close_reason"] = reason + @staticmethod + def errback(failure): + print(failure) + class GoToSpider(MetaSpider): name = "goto" @@ -35,20 +40,20 @@ def __init__(self, *args, **kwargs): self.urls_visited = [] def start_requests(self): - yield PuppeteerRequest(GoTo("https://some_url.com"), - callback=self.parse, errback=self.errback, - close_page=False) + yield PuppeteerRequest( + GoTo("https://some_url.com"), + callback=self.parse, + errback=self.errback, + close_page=False, + ) def parse(self, response, **kwargs): - body = b''' + body = b""" - ''' + """ if response.body == body: self.urls_visited.append(response.url) - def errback(self, failure): - print(failure) - class ClickSpider(MetaSpider): name = "click" @@ -58,25 +63,28 @@ def __init__(self, *args, **kwargs): self.urls_visited = [] def start_requests(self): - yield PuppeteerRequest(GoTo("https://some_url.com"), - callback=self.click, errback=self.errback, - close_page=False) + yield PuppeteerRequest( + GoTo("https://some_url.com"), + callback=self.click, + errback=self.errback, + close_page=False, + ) def click(self, response, **kwargs): - yield response.follow(Click("the_selector"), - callback=self.parse, errback=self.errback, - close_page=False) + yield response.follow( + Click("the_selector"), + callback=self.parse, + errback=self.errback, + close_page=False, + ) def parse(self, response, **kwargs): - body = b''' + body = b""" clicked - ''' + """ if response.body == body: 
             self.urls_visited.append(response.url)

-    def errback(self, failure):
-        print(failure)
-

 class ScreenshotSpider(MetaSpider):
     name = "screenshot"
@@ -86,24 +94,25 @@ def __init__(self, *args, **kwargs):
         self.urls_visited = []

     def start_requests(self):
-        yield PuppeteerRequest(GoTo("https://some_url.com"),
-                               callback=self.screenshot, errback=self.errback,
-                               close_page=False)
+        yield PuppeteerRequest(
+            GoTo("https://some_url.com"),
+            callback=self.screenshot,
+            errback=self.errback,
+            close_page=False,
+        )

     def screenshot(self, response, **kwargs):
-        yield response.follow(Screenshot(),
-                              callback=self.parse, errback=self.errback,
-                              close_page=False)
+        yield response.follow(
+            Screenshot(), callback=self.parse, errback=self.errback, close_page=False
+        )

     def parse(self, response, **kwargs):
         from base64 import b64encode
-        with open("./tests/scrapy_logo.png", 'rb') as image:
+
+        with open("./tests/scrapy_logo.png", "rb") as image:
             if b64encode(image.read()).decode() == response.screenshot:
                 self.urls_visited.append(response.url)

-    def errback(self, failure):
-        print(failure)
-

 class CustomJsActionSpider(MetaSpider):
     name = "custom_js_action"
@@ -113,28 +122,29 @@ def __init__(self, *args, **kwargs):
         self.urls_visited = []

     def start_requests(self):
-        yield PuppeteerRequest(GoTo("https://some_url.com"),
-                               callback=self.action, errback=self.errback,
-                               close_page=False)
+        yield PuppeteerRequest(
+            GoTo("https://some_url.com"),
+            callback=self.action,
+            errback=self.errback,
+            close_page=False,
+        )

     def action(self, response, **kwargs):
-        js_function = '''
+        js_function = """
         some js function
-        '''
-        yield response.follow(CustomJsAction(js_function),
-                              callback=self.parse, errback=self.errback,
-                              close_page=False)
+        """
+        yield response.follow(
+            CustomJsAction(js_function),
+            callback=self.parse,
+            errback=self.errback,
+            close_page=False,
+        )

     def parse(self, response, **kwargs):
-        response_data = {
-            'field': "Hello!"
-        }
+        response_data = {"field": "Hello!"}
         if response.data == response_data:
             self.urls_visited.append(response.url)

-    def errback(self, failure):
-        print(failure)
-

 class GoBackForwardSpider(MetaSpider):
     name = "go_back_forward"
@@ -144,36 +154,69 @@ def __init__(self, *args, **kwargs):
         self.urls_visited = []

     def start_requests(self):
-        yield PuppeteerRequest(GoTo("https://some_url.com"),
-                               callback=self.go_next, errback=self.errback,
-                               close_page=False)
+        yield PuppeteerRequest(
+            GoTo("https://some_url.com"),
+            callback=self.go_next,
+            errback=self.errback,
+            close_page=False,
+        )

     def go_next(self, response, **kwargs):
-        yield response.follow(GoTo("/article"),
-                              callback=self.go_back, errback=self.errback,
-                              close_page=False)
+        yield response.follow(
+            GoTo("/article"),
+            callback=self.go_back,
+            errback=self.errback,
+            close_page=False,
+        )

     def go_back(self, response, **kwargs):
-        yield response.follow(GoBack(),
-                              callback=self.go_forward, errback=self.errback,
-                              close_page=False)
+        yield response.follow(
+            GoBack(), callback=self.go_forward, errback=self.errback, close_page=False
+        )

     def go_forward(self, response, **kwargs):
-        body = b'''
+        body = b"""
         went back
-        '''
+        """
         assert response.body == body
-        yield response.follow(GoForward(),
-                              callback=self.parse, errback=self.errback,
-                              close_page=False)
+        yield response.follow(
+            GoForward(), callback=self.parse, errback=self.errback, close_page=False
+        )

     def parse(self, response, **kwargs):
-        body = b'''
+        body = b"""
         went forward
-        '''
+        """
         if response.body == body:
             self.urls_visited.append(response.url)

-    def errback(self, failure):
-        print(failure)
+
+class RecaptchaSolverSpider(MetaSpider):
+    name = "recaptcha_solver"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.urls_visited = []
+
+    def start_requests(self):
+        yield PuppeteerRequest(
+            GoTo("https://some_url.com/with_captcha"),
+            callback=self.solve_recaptcha,
+            errback=self.errback,
+            close_page=False,
+        )
+
+    def solve_recaptcha(self, response, **kwargs):
+        yield response.follow(
+            RecaptchaSolver(solve_recaptcha=True),
+            callback=self.parse,
+            errback=self.errback,
+            close_page=False,
+        )
+
+    def parse(self, response, **kwargs):
+        if response.data["recaptcha_data"]["captchas"] == [
+            1
+        ] and response.recaptcha_data["captchas"] == [1]:
+            self.urls_visited.append(response.url)
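
For reviewers, here is a minimal sketch of how the new RecaptchaSolver action is meant to be driven from a spider, mirroring tests/spiders.py above. It is illustrative only: the spider class, its name, and the target URL are invented for this sketch; the top-level PuppeteerRequest import is assumed to match the package layout; and the solve_recaptcha flag plus the recaptcha_data/captchas keys are taken from the test code in this patch rather than from a documented service contract.

import scrapy

from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import GoTo, RecaptchaSolver


class RecaptchaSketchSpider(scrapy.Spider):
    # Hypothetical spider, not part of the patch.
    name = "recaptcha_sketch"

    def start_requests(self):
        # Navigate first; close_page=False keeps the browser page alive so
        # the follow-up action runs against the same page and context.
        yield PuppeteerRequest(
            GoTo("https://some_url.com/with_captcha"),
            callback=self.solve_recaptcha,
            close_page=False,
        )

    def solve_recaptcha(self, response):
        # Ask the service to find and solve any captcha on the current page.
        yield response.follow(
            RecaptchaSolver(solve_recaptcha=True), callback=self.parse
        )

    def parse(self, response, **kwargs):
        # The mock endpoint reports solving results under "recaptcha_data";
        # the response exposes the same structure as response.recaptcha_data.
        self.logger.info(response.recaptcha_data["captchas"])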