Commit

splitting into abstract classes
AndrewKorzh committed Jul 26, 2024
1 parent ef4666a commit 2958815
Showing 1 changed file with 180 additions and 119 deletions.
299 changes: 180 additions & 119 deletions scrapypuppeteer/middleware.py
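
At a glance, the commit extracts browser handling from PuppeteerServiceDownloaderMiddleware into an abstract BrowserManager with two concrete implementations, chosen in from_crawler by the PUPPETEER_LOCAL setting. A minimal sketch of the resulting structure, condensed from the diff below (method bodies omitted):

from abc import ABC, abstractmethod

class BrowserManager(ABC):
    @abstractmethod
    def process_request(self, request, spider): ...

    @abstractmethod
    def close_used_contexts(self): ...

class LocalBrowserManager(BrowserManager):
    # drives a local pyppeteer browser via LocalScrapyPyppeteer
    ...

class ServiceBrowserManager(BrowserManager):
    # builds ActionRequests against the remote Puppeteer service
    ...

# PuppeteerServiceDownloaderMiddleware.from_crawler then picks one:
#   PUPPETEER_LOCAL = True  -> LocalBrowserManager()
#   PUPPETEER_LOCAL = False -> ServiceBrowserManager(service_url, include_meta, include_headers, crawler)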
@@ -3,6 +3,7 @@
from collections import defaultdict
from typing import List, Union
from urllib.parse import urlencode, urljoin
from abc import ABC, abstractmethod

from scrapy import signals
from scrapy.crawler import Crawler
@@ -31,112 +32,101 @@
from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest
from scrapypuppeteer.scrappypyppeteer import LocalScrapyPyppeteer

import asyncio

class BrowserManager(ABC):
@abstractmethod
def process_request(self, request, spider):
pass

@abstractmethod
def close_used_contexts(self):
pass

class PuppeteerServiceDownloaderMiddleware:
"""
This downloader middleware converts PuppeteerRequest instances to
Puppeteer service API requests and then converts its responses to
PuppeteerResponse instances. Additionally, it tracks all browser contexts
that spider uses and performs cleanup request to service right before
spider is closed.

Additionally, the middleware uses these meta-keys, do not use them, because their changing
could possibly (almost probably) break determined behaviour:
'puppeteer_request', 'dont_obey_robotstxt', 'proxy'
class LocalBrowserManager(BrowserManager):
def __init__(self):
self.local_scrapy_pyppeteer = LocalScrapyPyppeteer()

Settings:
def process_request(self, request):
pyp_request = self.process_puppeteer_request(request)
return pyp_request

PUPPETEER_SERVICE_URL (str)
Service URL, e.g. 'http://localhost:3000'
def process_puppeteer_request(self, request: PuppeteerRequest):
action = request.action
service_url = 'http://_running_local_'
service_params = self._encode_service_params(request)
if service_params:
service_url += "?" + service_params

PUPPETEER_INCLUDE_HEADERS (bool|list[str])
Determines which request headers will be sent to remote site by puppeteer service.
Either True (all headers), False (no headers) or list of header names.
May be overridden per request.
By default, only cookies are sent.
meta = {
"puppeteer_request": request,
"dont_obey_robotstxt": True,
"proxy": None,
}

PUPPETEER_INCLUDE_META (bool)
Determines whether to send or not user's meta attached by user.
Default to False.
"""
action_request = ActionRequest(
url=service_url,
action=action,
cookies=request.cookies,
meta=meta,
)
puppeteer_response = self.local_scrapy_pyppeteer.process_puppeteer_request(action_request)

SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL"
INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS"
SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META"
DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately
return puppeteer_response

PUPPETEER_LOCAL_SETTING = "PUPPETEER_LOCAL"
@staticmethod
def _encode_service_params(request):
service_params = {}
if request.context_id is not None:
service_params["contextId"] = request.context_id
if request.page_id is not None:
service_params["pageId"] = request.page_id
if request.close_page:
service_params["closePage"] = 1
return urlencode(service_params)

def close_used_contexts(self):
self.local_scrapy_pyppeteer.context_manager.close_browser()

service_logger = logging.getLogger(__name__)

def __init__(
self,
crawler: Crawler,
service_url: str,
include_headers: Union[bool, List[str]],
include_meta: bool,
local_mode: bool,
local_scrapy_pyppeteer: LocalScrapyPyppeteer
):
self.service_base_url = service_url
self.include_headers = include_headers
self.include_meta = include_meta
self.crawler = crawler
self.used_contexts = defaultdict(set)
self.local_mode = local_mode
self.local_scrapy_pyppeteer = local_scrapy_pyppeteer

@classmethod
def from_crawler(cls, crawler):
service_url = crawler.settings.get(cls.SERVICE_URL_SETTING)
local_mode = crawler.settings.getbool(cls.PUPPETEER_LOCAL_SETTING, False)
local_scrapy_pyppeteer = None
if local_mode:
print("\n\nLOCAL MODE\n\n")
local_scrapy_pyppeteer = LocalScrapyPyppeteer()

if local_mode:
service_url = 'http://_running_local_'
class ServiceBrowserManager(BrowserManager):
def __init__(self, service_base_url, include_meta, include_headers, crawler):
#### TODO: add passing of these parameters ####
self.service_base_url = service_base_url
self.include_meta = include_meta
self.include_headers = include_headers
self.used_contexts = defaultdict(set)
self.service_logger = logging.getLogger(__name__)
self.crawler = crawler

if service_url is None:
if self.service_base_url is None:
raise ValueError("Puppeteer service URL must be provided")
if cls.INCLUDE_HEADERS_SETTING in crawler.settings:
try:
include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING)
except ValueError:
include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING)
else:
include_headers = cls.DEFAULT_INCLUDE_HEADERS
include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False)
middleware = cls(crawler, service_url, include_headers, include_meta, local_mode, local_scrapy_pyppeteer)
crawler.signals.connect(
middleware.close_used_contexts, signal=signals.spider_idle
)
return middleware

def process_request(self, request, spider):

def process_request(self, request):
if isinstance(request, CloseContextRequest):
return self.process_close_context_request(request)

if isinstance(request, PuppeteerRequest):
return self.process_puppeteer_request(request)

def process_close_context_request(self, request: CloseContextRequest):
if not request.is_valid_url:
return request.replace(
url=urljoin(self.service_base_url, "/close_context"),
)

def process_puppeteer_request(self, request: PuppeteerRequest):
action = request.action
service_url = urljoin(self.service_base_url, action.endpoint)
service_params = self._encode_service_params(request)

if service_params:
service_url += "?" + service_params


meta = {
"puppeteer_request": request,
"dont_obey_robotstxt": True,
@@ -145,7 +135,7 @@ def process_puppeteer_request(self, request: PuppeteerRequest):
if self.include_meta:
meta = {**request.meta, **meta}

action_request = ActionRequest(
action_request = ActionRequest(
url=service_url,
action=action,
method="POST",
@@ -159,17 +149,8 @@ def process_puppeteer_request(self, request: PuppeteerRequest):
errback=request.errback,
meta=meta,
)
print("Request\n")
print(action_request.url)
print()

if self.local_mode:
puppeteer_response = self.local_scrapy_pyppeteer.process_puppeteer_request(action_request)
print(action_request.action.payload())

return puppeteer_response
return action_request

@staticmethod
def _encode_service_params(request):
service_params = {}
@@ -180,6 +161,7 @@ def _encode_service_params(request):
if request.close_page:
service_params["closePage"] = 1
return urlencode(service_params)


def _serialize_body(self, action, request):
payload = action.payload()
@@ -205,13 +187,124 @@ def _serialize_body(self, action, request):
return json.dumps(payload)
return str(payload)

def close_used_contexts(self, spider):
contexts = list(self.used_contexts.pop(id(spider), set()))
if contexts:
request = CloseContextRequest(
contexts,
meta={"proxy": None},
)

def handle_close_contexts_result(result):
if isinstance(result, Response):
if result.status == 200:
self.service_logger.debug(
f"Successfully closed {len(request.contexts)} "
f"contexts with request {result.request}"
)
else:
self.service_logger.warning(
f"Could not close contexts: {result.text}"
)
elif isinstance(result, Failure):
self.service_logger.warning(
f"Could not close contexts: {result.value}",
exc_info=failure_to_exc_info(result),
)

dfd = self.crawler.engine.download(request)
dfd.addBoth(handle_close_contexts_result)

raise DontCloseSpider()



class PuppeteerServiceDownloaderMiddleware:
"""
This downloader middleware converts PuppeteerRequest instances to
Puppeteer service API requests and then converts its responses to
PuppeteerResponse instances. Additionally, it tracks all browser contexts
that spider uses and performs cleanup request to service right before
spider is closed.
Additionally, the middleware uses these meta-keys; do not use them, because changing
them will almost certainly break the expected behaviour:
'puppeteer_request', 'dont_obey_robotstxt', 'proxy'
Settings:
PUPPETEER_SERVICE_URL (str)
Service URL, e.g. 'http://localhost:3000'
PUPPETEER_INCLUDE_HEADERS (bool|list[str])
Determines which request headers will be sent to the remote site by the Puppeteer service.
Either True (all headers), False (no headers) or list of header names.
May be overridden per request.
By default, only cookies are sent.
PUPPETEER_INCLUDE_META (bool)
Determines whether or not to send the user's meta attached to the request.
Defaults to False.
"""

SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL"
INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS"
SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META"
DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately

PUPPETEER_LOCAL_SETTING = "PUPPETEER_LOCAL"

service_logger = logging.getLogger(__name__)

def __init__(
self,
crawler: Crawler,
service_url: str,
include_headers: Union[bool, List[str]],
include_meta: bool,
local_mode: bool,
browser_manager: Union[ServiceBrowserManager, LocalBrowserManager]
):
self.service_base_url = service_url
self.include_headers = include_headers
self.include_meta = include_meta
self.crawler = crawler
self.used_contexts = defaultdict(set)
self.local_mode = local_mode
self.browser_manager = browser_manager

@classmethod
def from_crawler(cls, crawler):
service_url = crawler.settings.get(cls.SERVICE_URL_SETTING)
local_mode = crawler.settings.getbool(cls.PUPPETEER_LOCAL_SETTING, False)
if cls.INCLUDE_HEADERS_SETTING in crawler.settings:
try:
include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING)
except ValueError:
include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING)
else:
include_headers = cls.DEFAULT_INCLUDE_HEADERS
include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False)


if local_mode:
browser_manager = LocalBrowserManager()
else:
browser_manager = ServiceBrowserManager(service_url, include_meta, include_headers, crawler)

middleware = cls(crawler, service_url, include_headers, include_meta, local_mode, browser_manager)
crawler.signals.connect(
middleware.browser_manager.close_used_contexts, signal=signals.spider_idle
)
return middleware
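
For reference, the settings read in from_crawler could be configured roughly as follows (the middleware path matches this file; the priority and values are illustrative assumptions):

# settings.py -- illustrative values only
DOWNLOADER_MIDDLEWARES = {
    "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,  # priority is an arbitrary example
}
PUPPETEER_SERVICE_URL = "http://localhost:3000"        # used by ServiceBrowserManager
PUPPETEER_INCLUDE_HEADERS = ["Cookie", "User-Agent"]   # or True / False
PUPPETEER_INCLUDE_META = False
PUPPETEER_LOCAL = False                                # True -> LocalBrowserManager, no service needed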


def process_response(self, request, response, spider):

print(f"\n\n\n\nProcessing responce\nlocal_mode = {self.local_mode}\n\n\n")
def process_request(self, request, spider):
return self.browser_manager.process_request(request)



def process_response(self, request, response, spider):
if not isinstance(response, TextResponse):
return response

@@ -272,38 +365,6 @@ def _get_response_class(request_action):
return PuppeteerRecaptchaSolverResponse
return PuppeteerJsonResponse

def close_used_contexts(self, spider):
contexts = list(self.used_contexts.pop(id(spider), set()))
if contexts:
request = CloseContextRequest(
contexts,
meta={"proxy": None},
)

def handle_close_contexts_result(result):
if isinstance(result, Response):
if result.status == 200:
self.service_logger.debug(
f"Successfully closed {len(request.contexts)} "
f"contexts with request {result.request}"
)
else:
self.service_logger.warning(
f"Could not close contexts: {result.text}"
)
elif isinstance(result, Failure):
self.service_logger.warning(
f"Could not close contexts: {result.value}",
exc_info=failure_to_exc_info(result),
)

dfd = self.crawler.engine.download(request)
dfd.addBoth(handle_close_contexts_result)

raise DontCloseSpider()





class PuppeteerRecaptchaDownloaderMiddleware:
