From 29588154936c715305f4a1e4bccaf543daeae411 Mon Sep 17 00:00:00 2001
From: AndrewKorzh <92707967+AndrewKorzh@users.noreply.github.com>
Date: Fri, 26 Jul 2024 14:24:32 +0300
Subject: [PATCH] splitting into abstract classes

---
 scrapypuppeteer/middleware.py | 299 ++++++++++++++++++++--------------
 1 file changed, 180 insertions(+), 119 deletions(-)

diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py
index 91abf16..1011fda 100644
--- a/scrapypuppeteer/middleware.py
+++ b/scrapypuppeteer/middleware.py
@@ -3,6 +3,7 @@
 from collections import defaultdict
 from typing import List, Union
 from urllib.parse import urlencode, urljoin
+from abc import ABC, abstractmethod
 
 from scrapy import signals
 from scrapy.crawler import Crawler
@@ -31,112 +32,101 @@
 from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest
 from scrapypuppeteer.scrappypyppeteer import LocalScrapyPyppeteer
-import asyncio
+class BrowserManager(ABC):
+    @abstractmethod
+    def process_request(self, request, spider):
+        pass
+
+    @abstractmethod
+    def close_used_contexts(self):
+        pass
 
-class PuppeteerServiceDownloaderMiddleware:
-    """
-    This downloader middleware converts PuppeteerRequest instances to
-    Puppeteer service API requests and then converts its responses to
-    PuppeteerResponse instances. Additionally, it tracks all browser contexts
-    that spider uses and performs cleanup request to service right before
-    spider is closed.
-    Additionally, the middleware uses these meta-keys, do not use them, because their changing
-    could possibly (almost probably) break determined behaviour:
-    'puppeteer_request', 'dont_obey_robotstxt', 'proxy'
+class LocalBrowserManager(BrowserManager):
+    def __init__(self):
+        self.local_scrapy_pyppeteer = LocalScrapyPyppeteer()
 
-    Settings:
+    def process_request(self, request):
+        pyp_request = self.process_puppeteer_request(request)
+        return pyp_request
 
-    PUPPETEER_SERVICE_URL (str)
-        Service URL, e.g. 'http://localhost:3000'
+    def process_puppeteer_request(self, request: PuppeteerRequest):
+        action = request.action
+        service_url = 'http://_running_local_'
+        service_params = self._encode_service_params(request)
+        if service_params:
+            service_url += "?" + service_params
 
-    PUPPETEER_INCLUDE_HEADERS (bool|list[str])
-        Determines which request headers will be sent to remote site by puppeteer service.
-        Either True (all headers), False (no headers) or list of header names.
-        May be overridden per request.
-        By default, only cookies are sent.
+        meta = {
+            "puppeteer_request": request,
+            "dont_obey_robotstxt": True,
+            "proxy": None,
+        }
 
-    PUPPETEER_INCLUDE_META (bool)
-        Determines whether to send or not user's meta attached by user.
-        Default to False.
- """ + action_request = ActionRequest( + url=service_url, + action=action, + cookies=request.cookies, + meta=meta, + ) + puppeteer_response = self.local_scrapy_pyppeteer.process_puppeteer_request(action_request) - SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" - INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" - SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" - DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately + return puppeteer_response - PUPPETEER_LOCAL_SETTING = "PUPPETEER_LOCAL" + @staticmethod + def _encode_service_params(request): + service_params = {} + if request.context_id is not None: + service_params["contextId"] = request.context_id + if request.page_id is not None: + service_params["pageId"] = request.page_id + if request.close_page: + service_params["closePage"] = 1 + return urlencode(service_params) + + def close_used_contexts(self): + self.local_scrapy_pyppeteer.context_manager.close_browser() - service_logger = logging.getLogger(__name__) - def __init__( - self, - crawler: Crawler, - service_url: str, - include_headers: Union[bool, List[str]], - include_meta: bool, - local_mode: bool, - local_scrapy_pyppeteer: LocalScrapyPyppeteer - ): - self.service_base_url = service_url - self.include_headers = include_headers - self.include_meta = include_meta - self.crawler = crawler - self.used_contexts = defaultdict(set) - self.local_mode = local_mode - self.local_scrapy_pyppeteer = local_scrapy_pyppeteer - @classmethod - def from_crawler(cls, crawler): - service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) - local_mode = crawler.settings.getbool(cls.PUPPETEER_LOCAL_SETTING, False) - local_scrapy_pyppeteer = None - if local_mode: - print("\n\nLOCAL MODE\n\n") - local_scrapy_pyppeteer = LocalScrapyPyppeteer() - if local_mode: - service_url = 'http://_running_local_' +class ServiceBrowserManager(BrowserManager): + def __init__(self, service_base_url, include_meta, include_headers, crawler): + #### добавить передачу этих параметров #### + self.service_base_url = service_base_url + self.include_meta = include_meta + self.include_headers = include_headers + self.used_contexts = defaultdict(set) + self.service_logger = logging.getLogger(__name__) + self.crawler = crawler - if service_url is None: + if self.service_base_url is None: raise ValueError("Puppeteer service URL must be provided") - if cls.INCLUDE_HEADERS_SETTING in crawler.settings: - try: - include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) - except ValueError: - include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) - else: - include_headers = cls.DEFAULT_INCLUDE_HEADERS - include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) - middleware = cls(crawler, service_url, include_headers, include_meta, local_mode, local_scrapy_pyppeteer) - crawler.signals.connect( - middleware.close_used_contexts, signal=signals.spider_idle - ) - return middleware - def process_request(self, request, spider): - + + def process_request(self, request): if isinstance(request, CloseContextRequest): return self.process_close_context_request(request) if isinstance(request, PuppeteerRequest): return self.process_puppeteer_request(request) - + def process_close_context_request(self, request: CloseContextRequest): if not request.is_valid_url: return request.replace( url=urljoin(self.service_base_url, "/close_context"), ) - + def process_puppeteer_request(self, request: PuppeteerRequest): action = request.action service_url = urljoin(self.service_base_url, action.endpoint) 
         service_params = self._encode_service_params(request)
+
         if service_params:
             service_url += "?" + service_params
+
         meta = {
             "puppeteer_request": request,
             "dont_obey_robotstxt": True,
@@ -145,7 +135,7 @@ def process_puppeteer_request(self, request: PuppeteerRequest):
         if self.include_meta:
             meta = {**request.meta, **meta}
 
-        action_request = ActionRequest(
+        action_request = ActionRequest(
             url=service_url,
             action=action,
             method="POST",
@@ -159,17 +149,8 @@ def process_puppeteer_request(self, request: PuppeteerRequest):
             errback=request.errback,
             meta=meta,
         )
-        print("Request\n")
-        print(action_request.url)
-        print()
-
-        if self.local_mode:
-            puppeteer_response = self.local_scrapy_pyppeteer.process_puppeteer_request(action_request)
-            print(action_request.action.payload())
-
-            return puppeteer_response
         return action_request
-
+
     @staticmethod
     def _encode_service_params(request):
         service_params = {}
@@ -180,6 +161,7 @@ def _encode_service_params(request):
         if request.close_page:
             service_params["closePage"] = 1
         return urlencode(service_params)
+
     def _serialize_body(self, action, request):
         payload = action.payload()
@@ -205,13 +187,124 @@ def _serialize_body(self, action, request):
             return json.dumps(payload)
         return str(payload)
 
+    def close_used_contexts(self, spider):
+        contexts = list(self.used_contexts.pop(id(spider), set()))
+        if contexts:
+            request = CloseContextRequest(
+                contexts,
+                meta={"proxy": None},
+            )
+
+            def handle_close_contexts_result(result):
+                if isinstance(result, Response):
+                    if result.status == 200:
+                        self.service_logger.debug(
+                            f"Successfully closed {len(request.contexts)} "
+                            f"contexts with request {result.request}"
+                        )
+                    else:
+                        self.service_logger.warning(
+                            f"Could not close contexts: {result.text}"
+                        )
+                elif isinstance(result, Failure):
+                    self.service_logger.warning(
+                        f"Could not close contexts: {result.value}",
+                        exc_info=failure_to_exc_info(result),
+                    )
+
+            dfd = self.crawler.engine.download(request)
+            dfd.addBoth(handle_close_contexts_result)
+
+            raise DontCloseSpider()
+
+
+
+class PuppeteerServiceDownloaderMiddleware:
+    """
+    This downloader middleware converts PuppeteerRequest instances to
+    Puppeteer service API requests and then converts its responses to
+    PuppeteerResponse instances. Additionally, it tracks all browser contexts
+    that spider uses and performs cleanup request to service right before
+    spider is closed.
+
+    Additionally, the middleware uses these meta-keys; do not use them, because changing them
+    will almost certainly break the intended behaviour:
+    'puppeteer_request', 'dont_obey_robotstxt', 'proxy'
+
+    Settings:
+
+    PUPPETEER_SERVICE_URL (str)
+        Service URL, e.g. 'http://localhost:3000'
+
+    PUPPETEER_INCLUDE_HEADERS (bool|list[str])
+        Determines which request headers will be sent to the remote site by the puppeteer service.
+        Either True (all headers), False (no headers) or a list of header names.
+        May be overridden per request.
+        By default, only cookies are sent.
+
+    PUPPETEER_INCLUDE_META (bool)
+        Determines whether to send the user's meta attached to the request.
+        Defaults to False.
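+
+    PUPPETEER_LOCAL (bool)
+        If True, requests are handled in-process by LocalBrowserManager
+        (via LocalScrapyPyppeteer) instead of being forwarded to the
+        remote puppeteer service by ServiceBrowserManager.
+        Defaults to False.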
+ """ + + SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" + INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" + SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" + DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately + + PUPPETEER_LOCAL_SETTING = "PUPPETEER_LOCAL" + + service_logger = logging.getLogger(__name__) + + def __init__( + self, + crawler: Crawler, + service_url: str, + include_headers: Union[bool, List[str]], + include_meta: bool, + local_mode: bool, + browser_manager: Union[ServiceBrowserManager, LocalBrowserManager] + ): + self.service_base_url = service_url + self.include_headers = include_headers + self.include_meta = include_meta + self.crawler = crawler + self.used_contexts = defaultdict(set) + self.local_mode = local_mode + self.browser_manager = browser_manager + + @classmethod + def from_crawler(cls, crawler): + service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) + local_mode = crawler.settings.getbool(cls.PUPPETEER_LOCAL_SETTING, False) + if cls.INCLUDE_HEADERS_SETTING in crawler.settings: + try: + include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) + except ValueError: + include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) + else: + include_headers = cls.DEFAULT_INCLUDE_HEADERS + include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) + + + if local_mode: + browser_manager = LocalBrowserManager() + else: + browser_manager = ServiceBrowserManager(service_url, include_meta, include_headers, crawler) + + middleware = cls(crawler, service_url, include_headers, include_meta, local_mode, browser_manager) + crawler.signals.connect( + middleware.browser_manager.close_used_contexts, signal=signals.spider_idle + ) + return middleware - def process_response(self, request, response, spider): - print(f"\n\n\n\nProcessing responce\nlocal_mode = {self.local_mode}\n\n\n") + def process_request(self, request, spider): + return self.browser_manager.process_request(request) - + + def process_response(self, request, response, spider): if not isinstance(response, TextResponse): return response @@ -272,38 +365,6 @@ def _get_response_class(request_action): return PuppeteerRecaptchaSolverResponse return PuppeteerJsonResponse - def close_used_contexts(self, spider): - contexts = list(self.used_contexts.pop(id(spider), set())) - if contexts: - request = CloseContextRequest( - contexts, - meta={"proxy": None}, - ) - - def handle_close_contexts_result(result): - if isinstance(result, Response): - if result.status == 200: - self.service_logger.debug( - f"Successfully closed {len(request.contexts)} " - f"contexts with request {result.request}" - ) - else: - self.service_logger.warning( - f"Could not close contexts: {result.text}" - ) - elif isinstance(result, Failure): - self.service_logger.warning( - f"Could not close contexts: {result.value}", - exc_info=failure_to_exc_info(result), - ) - - dfd = self.crawler.engine.download(request) - dfd.addBoth(handle_close_contexts_result) - - raise DontCloseSpider() - - - class PuppeteerRecaptchaDownloaderMiddleware: