v0.2.91 #26

Merged · 6 commits · Dec 19, 2024
2 changes: 1 addition & 1 deletion scrapling/__init__.py
@@ -5,7 +5,7 @@
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair ([email protected])"
-__version__ = "0.2.9"
+__version__ = "0.2.91"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
2 changes: 2 additions & 0 deletions scrapling/core/_types.py
@@ -5,6 +5,8 @@
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
                     List, Literal, Optional, Pattern, Tuple, Type, Union)
 
+SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
+
 try:
     from typing import Protocol
 except ImportError:
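The new `SelectorWaitStates` alias enumerates the four states Playwright's `wait_for_selector` accepts, so an invalid `wait_selector_state` is caught by a static type checker instead of failing at runtime inside the browser call. A minimal sketch of the pattern (the `wait_until` function below is hypothetical, for illustration only):

```python
from typing import Literal

SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]

def wait_until(selector: str, state: SelectorWaitStates = "attached") -> None:
    # At runtime a Literal is just a str; the constraint only exists for
    # static checkers such as mypy or pyright.
    print(f"waiting for {selector!r} to become {state}")

wait_until("#content", "visible")   # accepted
# wait_until("#content", "loaded")  # rejected by the type checker
```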
62 changes: 44 additions & 18 deletions scrapling/engines/camo.py
@@ -3,7 +3,7 @@
 from camoufox.sync_api import Camoufox
 
 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log
 from scrapling.engines.toolbelt import (Response, StatusText,
                                         async_intercept_route,
@@ -18,7 +18,7 @@ def __init__(
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
             geoip: Optional[bool] = False,
             adaptor_arguments: Dict = None,
@@ -84,6 +84,14 @@ def fetch(self, url: str) -> Response:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         with Camoufox(
             geoip=self.geoip,
             proxy=self.proxy,
@@ -100,13 +108,15 @@ def fetch(self, url: str) -> Response:
             page = browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 page.route("**/*", intercept_route)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
 
-            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -123,21 +133,24 @@ def fetch(self, url: str) -> Response:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=page.content(),
-                body=page.content().encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=res.all_headers(),
-                request_headers=res.request.all_headers(),
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -151,6 +164,14 @@ async def async_fetch(self, url: str) -> Response:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with AsyncCamoufox(
             geoip=self.geoip,
             proxy=self.proxy,
@@ -167,13 +188,15 @@ async def async_fetch(self, url: str) -> Response:
             page = await browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 await page.route("**/*", async_intercept_route)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
 
-            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -190,21 +213,24 @@ async def async_fetch(self, url: str) -> Response:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=await page.content(),
-                body=(await page.content()).encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await res.all_headers(),
-                request_headers=await res.request.all_headers(),
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
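In both `fetch` variants above, `page.goto()` returns only the initial navigation's response; if the page then redirects client-side (JavaScript or a meta refresh) or navigates again while `page_action`/`wait_selector` run, that object goes stale. The `handle_response` listener keeps the most recent response whose `request.resource_type == "document"`, so status, headers, and raw body describe the page the browser actually ended on, with `first_response` kept only as a fallback. A standalone sketch of the same pattern in plain sync Playwright (the redirect URL is just an example target):

```python
from playwright.sync_api import sync_playwright

final_response = None

def handle_response(finished_response):
    # Fires for every network response; remember the newest main-document one.
    global final_response
    if finished_response.request.resource_type == "document":
        final_response = finished_response

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.on("response", handle_response)
    first_response = page.goto("https://httpbin.org/redirect/2")
    page.wait_for_load_state("domcontentloaded")
    res = final_response or first_response  # same fallback as the diff
    print(res.url, res.status, len(res.body()))
    browser.close()
```

The async engine registers the identical listener; only the handler is a coroutine and `final_response.body()` is awaited.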
65 changes: 47 additions & 18 deletions scrapling/engines/pw.py
@@ -1,6 +1,7 @@
 import json
 
-from scrapling.core._types import Callable, Dict, Optional, Union
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
@@ -23,7 +24,7 @@ def __init__(
             page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
-            wait_selector_state: Optional[str] = 'attached',
+            wait_selector_state: SelectorWaitStates = 'attached',
             stealth: Optional[bool] = False,
             real_chrome: Optional[bool] = False,
             hide_canvas: Optional[bool] = False,
@@ -193,12 +194,21 @@ def fetch(self, url: str) -> Response:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.sync_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
 
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         with sync_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -212,6 +222,8 @@ def fetch(self, url: str) -> Response:
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
@@ -223,7 +235,7 @@ def fetch(self, url: str) -> Response:
                 for script in self.__stealth_scripts():
                     page.add_init_script(path=script)
 
-            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -240,21 +252,24 @@ def fetch(self, url: str) -> Response:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=page.content(),
-                body=page.content().encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=res.all_headers(),
-                request_headers=res.request.all_headers(),
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -266,12 +281,21 @@ async def async_fetch(self, url: str) -> Response:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.async_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.async_api import async_playwright
         else:
             from rebrowser_playwright.async_api import async_playwright
 
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with async_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -285,6 +309,8 @@ async def async_fetch(self, url: str) -> Response:
             page = await context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
@@ -296,7 +322,7 @@ async def async_fetch(self, url: str) -> Response:
                 for script in self.__stealth_scripts():
                     await page.add_init_script(path=script)
 
-            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -313,21 +339,24 @@ async def async_fetch(self, url: str) -> Response:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=await page.content(),
-                body=(await page.content()).encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await res.all_headers(),
-                request_headers=await res.request.all_headers(),
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
            )
             await page.close()
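A side effect of the same change in both engines: `body` is now `final_response.body()`, the exact bytes the server sent, instead of `page.content().encode('utf-8')`, which re-serialized the live DOM and forced UTF-8. The `encoding` value is still just the raw `content-type` header with a `'utf-8'` fallback, to be parsed later inside `Response`. A rough sketch of what such parsing involves (a hypothetical helper, not scrapling's actual implementation):

```python
def charset_from_content_type(content_type: str, default: str = "utf-8") -> str:
    # "text/html; charset=ISO-8859-1" -> "iso-8859-1"
    for param in content_type.split(";")[1:]:
        key, _, value = param.strip().partition("=")
        if key.lower() == "charset" and value:
            return value.strip("'\"").lower()
    return default

assert charset_from_content_type("text/html; charset=ISO-8859-1") == "iso-8859-1"
assert charset_from_content_type("") == "utf-8"  # mirrors the `or 'utf-8'` fallback
```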
6 changes: 1 addition & 5 deletions scrapling/engines/toolbelt/custom.py
@@ -84,8 +84,6 @@ def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') ->
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    _is_response_result_logged = False  # Class-level flag, initialized to False
-
     def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
                  encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
@@ -99,9 +97,7 @@ def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, c
         # For back-ward compatibility
         self.adaptor = self
         # For easier debugging while working from a Python shell
-        if not Response._is_response_result_logged:
-            log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
-            Response._is_response_result_logged = True
+        log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
 
         # def __repr__(self):
         #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
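Previously the class-level flag meant only the first `Response` built in a process logged its `Fetched (...)` line; with the flag removed, every fetch logs once at INFO. If that is too verbose, standard `logging` configuration can raise the threshold (assuming the logger is named after the package, as is conventional):

```python
import logging

# Suppress the per-fetch INFO line; "scrapling" as the logger name is an
# assumption based on the usual getLogger(__package__) convention.
logging.getLogger("scrapling").setLevel(logging.WARNING)
```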