From 58151c12e8132c560e2114e77fe1e33fb066fd2c Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Sun, 24 Nov 2024 13:41:06 +0200 Subject: [PATCH 1/9] Update the minimum browser version for headers generated --- scrapling/engines/toolbelt/fingerprints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapling/engines/toolbelt/fingerprints.py b/scrapling/engines/toolbelt/fingerprints.py index 71b8e84..9cd337d 100644 --- a/scrapling/engines/toolbelt/fingerprints.py +++ b/scrapling/engines/toolbelt/fingerprints.py @@ -67,7 +67,7 @@ def generate_headers(browser_mode: bool = False) -> Dict: # So we don't raise any inconsistency red flags while websites fingerprinting us os_name = get_os_name() return HeaderGenerator( - browser=[Browser(name='chrome', min_version=128)], + browser=[Browser(name='chrome', min_version=130)], os=os_name, # None is ignored device='desktop' ).generate() From 7f330efc52a3ffd14d698b70bdb11894cb248a89 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Sun, 24 Nov 2024 13:44:16 +0200 Subject: [PATCH 2/9] Turn-off the `hide_canvas` option by default --- scrapling/engines/pw.py | 2 +- scrapling/fetchers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py index 78c1d39..32c336a 100644 --- a/scrapling/engines/pw.py +++ b/scrapling/engines/pw.py @@ -28,7 +28,7 @@ def __init__( wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached', stealth: bool = False, - hide_canvas: bool = True, + hide_canvas: bool = False, disable_webgl: bool = False, cdp_url: Optional[str] = None, nstbrowser_mode: bool = False, diff --git a/scrapling/fetchers.py b/scrapling/fetchers.py index 6594a68..1569c31 100644 --- a/scrapling/fetchers.py +++ b/scrapling/fetchers.py @@ -147,7 +147,7 @@ def fetch( self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None, useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: 
Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached', - hide_canvas: bool = True, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True, + hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True, proxy: Optional[Union[str, Dict[str, str]]] = None, stealth: bool = False, cdp_url: Optional[str] = None, From 15b7e3aceeb9ccada736cad8c66806ea3d554e66 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Sun, 24 Nov 2024 13:47:21 +0200 Subject: [PATCH 3/9] Stop using generated headers by browserforge It turns out its headers format is pretty outdated, which made Google reCaptcha not work sometimes --- scrapling/engines/pw.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py index 32c336a..6aa89ec 100644 --- a/scrapling/engines/pw.py +++ b/scrapling/engines/pw.py @@ -130,8 +130,8 @@ def fetch(self, url: str) -> Response: extra_headers = {} useragent = self.useragent else: - extra_headers = generate_headers(browser_mode=True) - useragent = extra_headers.get('User-Agent') + extra_headers = {} + useragent = generate_headers(browser_mode=True).get('User-Agent') # Prepare the flags before diving flags = DEFAULT_STEALTH_FLAGS From 088117441d7fda0907e4b10b2b78486e8da32dde Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Sun, 24 Nov 2024 13:49:31 +0200 Subject: [PATCH 4/9] Making the type hints more accurate before update --- scrapling/engines/pw.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py index 6aa89ec..e49f622 100644 --- a/scrapling/engines/pw.py +++ b/scrapling/engines/pw.py @@ -27,11 +27,11 @@ def __init__( page_action: Callable = do_nothing, wait_selector: Optional[str] = None, 
wait_selector_state: Optional[str] = 'attached', - stealth: bool = False, - hide_canvas: bool = False, - disable_webgl: bool = False, + stealth: Optional[bool] = False, + hide_canvas: Optional[bool] = False, + disable_webgl: Optional[bool] = False, cdp_url: Optional[str] = None, - nstbrowser_mode: bool = False, + nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None, google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, From e96e7d444000177ae23cbb368a8bbf1755dd1667 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Sun, 24 Nov 2024 15:14:42 +0200 Subject: [PATCH 5/9] Adding the option to use real chrome browser directly --- scrapling/engines/pw.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py index e49f622..80ff55b 100644 --- a/scrapling/engines/pw.py +++ b/scrapling/engines/pw.py @@ -28,6 +28,7 @@ def __init__( wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached', stealth: Optional[bool] = False, + real_chrome: Optional[bool] = False, hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, cdp_url: Optional[str] = None, @@ -51,6 +52,7 @@ def __init__( :param wait_selector: Wait for a specific css selector to be in a specific state. :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`. :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently. + :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting. :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely. 
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. @@ -67,6 +69,7 @@ def __init__( self.stealth = bool(stealth) self.hide_canvas = bool(hide_canvas) self.disable_webgl = bool(disable_webgl) + self.real_chrome = bool(real_chrome) self.google_search = bool(google_search) self.extra_headers = extra_headers or {} self.proxy = construct_proxy_dict(proxy) @@ -119,7 +122,8 @@ def fetch(self, url: str) -> Response: :param url: Target url. :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ - if not self.stealth: + if not self.stealth or self.real_chrome: + # Because rebrowser_playwright doesn't play well with real browsers from playwright.sync_api import sync_playwright else: from rebrowser_playwright.sync_api import sync_playwright @@ -146,9 +150,11 @@ def fetch(self, url: str) -> Response: browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url) else: if self.stealth: - browser = p.chromium.launch(headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True) + browser = p.chromium.launch( + headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium' + ) else: - browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation']) + browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium') # Creating the context if self.stealth: From a8c576ad2850a3764bdc286b122ed626f0810da2 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Sun, 24 Nov 2024 15:15:08 +0200 Subject: [PATCH 6/9] Adding the `real_chrome` option to the fetcher --- scrapling/fetchers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff 
--git a/scrapling/fetchers.py b/scrapling/fetchers.py index 1569c31..f0de8dc 100644 --- a/scrapling/fetchers.py +++ b/scrapling/fetchers.py @@ -146,12 +146,12 @@ class PlayWrightFetcher(BaseFetcher): def fetch( self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None, useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000, - page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached', - hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True, + page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached', + hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True, proxy: Optional[Union[str, Dict[str, str]]] = None, - stealth: bool = False, + stealth: Optional[bool] = False, real_chrome: Optional[bool] = False, cdp_url: Optional[str] = None, - nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None, + nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None, ) -> Response: """Opens up a browser and do your request based on your chosen options below. @@ -167,6 +167,7 @@ def fetch( :param wait_selector: Wait for a specific css selector to be in a specific state. :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`. :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently. + :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting. 
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely. :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. @@ -184,6 +185,7 @@ def fetch( cdp_url=cdp_url, headless=headless, useragent=useragent, + real_chrome=real_chrome, page_action=page_action, hide_canvas=hide_canvas, network_idle=network_idle, From c1429074a79ef8a85e76bc15e486dc6d87137a83 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Sun, 24 Nov 2024 15:15:58 +0200 Subject: [PATCH 7/9] Setting the compatible version of playwright with cdp patches --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 13f4a35..98fce34 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ "orjson>=3", "tldextract", 'httpx[brotli,zstd]', - 'playwright', + 'playwright==1.48', # Temporary because currently All libraries that provide CDP patches doesn't support playwright 1.49 yet 'rebrowser-playwright', 'camoufox>=0.3.10', 'browserforge', From 12d638ac2e4cc0e4f12528d389162e103b98f72e Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Sun, 24 Nov 2024 15:20:18 +0200 Subject: [PATCH 8/9] Updating the doc to reflect new changes --- README.md | 5 ++++- scrapling/fetchers.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 028b41d..3320fd6 100644 --- a/README.md +++ b/README.md @@ -290,9 +290,11 @@ Using this Fetcher class, you can make requests with: * Mimics some of the real browsers' properties by injecting several JS files and using custom options. * Using custom flags on launch to hide Playwright even more and make it faster. * Generates real browser's headers of the same type and same user OS then append it to the request's headers. - 3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it. 
+ 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it. 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option. +> Note that using the `real_chrome` argument requires that you have Chrome browser installed on your device + Add that to a lot of controlling/hiding options as you will see in the arguments list below.
Expand this for the complete list of arguments @@ -314,6 +316,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ | | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ | | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ | +| real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ | | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ | | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ | | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ | diff --git a/scrapling/fetchers.py b/scrapling/fetchers.py index f0de8dc..bc47c8e 100644 --- a/scrapling/fetchers.py +++ b/scrapling/fetchers.py @@ -138,7 +138,7 @@ class PlayWrightFetcher(BaseFetcher): 2) Mimics some of the real browsers' properties by injecting several JS files and using custom options. 3) Using custom flags on launch to hide Playwright even more and make it faster. 4) Generates real browser's headers of the same type and same user OS then append it to the request. - - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it. + - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it. - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option. 
> Note that these are the main options with PlayWright but it can be mixed together. From 438867e0be8e3aebd50cd6a9c2f8e2eea26446d0 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Sun, 24 Nov 2024 15:21:58 +0200 Subject: [PATCH 9/9] Pumping the version up to 0.2.6 --- scrapling/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapling/__init__.py b/scrapling/__init__.py index 028caef..33342f4 100644 --- a/scrapling/__init__.py +++ b/scrapling/__init__.py @@ -4,7 +4,7 @@ from scrapling.core.custom_types import TextHandler, AttributesHandler __author__ = "Karim Shoair (karim.shoair@pm.me)" -__version__ = "0.2.5" +__version__ = "0.2.6" __copyright__ = "Copyright (c) 2024 Karim Shoair" diff --git a/setup.cfg b/setup.cfg index 9d059c8..be5aa12 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = scrapling -version = 0.2.5 +version = 0.2.6 author = Karim Shoair author_email = karim.shoair@pm.me description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python. diff --git a/setup.py b/setup.py index 98fce34..281f567 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="scrapling", - version="0.2.5", + version="0.2.6", description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It simplifies the process of extracting data from websites, even when they undergo structural changes, and offers impressive speed improvements over many popular scraping tools.""",