Commit
Use faster json parser & remove charset-normalizer
monosans authored Mar 6, 2025
1 parent 9926618 commit c9bd280
Showing 10 changed files with 40 additions and 164 deletions.
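Not part of the diff — a minimal stand-alone sketch of the json-to-orjson swap this commit applies throughout (orjson emits bytes rather than str, and OPT_INDENT_2 takes the place of json's indent/separators knobs; the sample data is made up):

    import json

    import orjson

    data = {"origin": "203.0.113.7", "ok": True}

    # Before: json emits str and is tuned via ensure_ascii/indent/separators.
    compact_str = json.dumps(data, ensure_ascii=False, separators=(",", ":"))
    pretty_str = json.dumps(data, ensure_ascii=False, indent="\t")

    # After: orjson emits UTF-8 bytes, compact by default; OPT_INDENT_2 adds a 2-space indent.
    compact_bytes = orjson.dumps(data)
    pretty_bytes = orjson.dumps(data, option=orjson.OPT_INDENT_2)

    # Parsing accepts bytes directly, so response bodies need no intermediate decode.
    assert orjson.loads(compact_bytes) == json.loads(compact_str)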
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -39,8 +39,8 @@ repos:
           - aiohttp-socks<0.11
           - attrs
           - certifi
-          - charset-normalizer<4
           - maxminddb<3
+          - orjson<4
           - platformdirs<5
           - rich<14
           - tomli<3
18 changes: 5 additions & 13 deletions proxy_scraper_checker/__main__.py
@@ -12,19 +12,11 @@
 from typing import TYPE_CHECKING
 
 import rich
-from aiohttp import ClientSession, TCPConnector
+from aiohttp import ClientSession, ClientTimeout, TCPConnector
 from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn
 from rich.table import Table
 
-from proxy_scraper_checker import (
-    checker,
-    geodb,
-    http,
-    output,
-    scraper,
-    sort,
-    utils,
-)
+from proxy_scraper_checker import checker, geodb, http, output, scraper, sort
 from proxy_scraper_checker.settings import Settings
 from proxy_scraper_checker.storage import ProxyStorage
 
@@ -99,8 +91,8 @@ def get_summary_table(
 
 async def main() -> None:
     config = tomllib.loads(
-        utils.bytes_decode(
-            await asyncio.to_thread(Path("config.toml").read_bytes)
+        await asyncio.to_thread(
+            Path("config.toml").read_text, encoding="utf-8", errors="replace"
         )
     )
     if config["debug"]:
@@ -112,7 +104,7 @@ async def main() -> None:
         headers=http.HEADERS,
         cookie_jar=http.get_cookie_jar(),
         raise_for_status=True,
-        fallback_charset_resolver=http.fallback_charset_resolver,
+        timeout=ClientTimeout(total=60, connect=5),
     ) as session:
         settings = await Settings.from_mapping(config, session=session)
         storage = ProxyStorage(protocols=settings.sources)
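For reference, a stand-alone sketch (not repo code) of the two patterns this file now uses: decoding config.toml as plain UTF-8 instead of running charset detection, and giving the shared ClientSession an explicit timeout. The file name and timeout values mirror the diff; everything else is illustrative.

    import asyncio
    import tomllib  # tomli on Python < 3.11
    from pathlib import Path

    from aiohttp import ClientSession, ClientTimeout


    async def run() -> None:
        # read_text with errors="replace" never raises UnicodeDecodeError,
        # which is what charset-normalizer previously guarded against.
        config = tomllib.loads(
            await asyncio.to_thread(
                Path("config.toml").read_text, encoding="utf-8", errors="replace"
            )
        )
        async with ClientSession(
            raise_for_status=True,
            # 60 s budget for the whole request, 5 s to establish the connection.
            timeout=ClientTimeout(total=60, connect=5),
        ) as session:
            async with session.get("https://example.com") as response:
                print(config.get("debug"), response.status)


    asyncio.run(run())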
27 changes: 1 addition & 26 deletions proxy_scraper_checker/http.py
@@ -3,40 +3,15 @@
 import ssl
 from functools import cache
 from types import MappingProxyType
-from typing import TYPE_CHECKING
 
 import certifi
 from aiohttp import DummyCookieJar, hdrs
-
-from proxy_scraper_checker.utils import bytes_decode
-
-if TYPE_CHECKING:
-    from typing import NoReturn
-
-    from aiohttp import ClientResponse
 
 HEADERS: MappingProxyType[str, str] = MappingProxyType({
     hdrs.USER_AGENT: (
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"  # noqa: E501
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"  # noqa: E501
     )
 })
 SSL_CONTEXT = ssl.create_default_context(cafile=certifi.where())
 SSL_CONTEXT.set_alpn_protocols(("http/1.1",))
-
-
-class NoCharsetHeaderError(Exception):
-    pass
-
-
-def fallback_charset_resolver(_r: ClientResponse, _b: bytes) -> NoReturn:
-    raise NoCharsetHeaderError
-
-
 get_cookie_jar = cache(DummyCookieJar)
-
-
-def get_response_text(*, response: ClientResponse, content: bytes) -> str:
-    try:
-        return content.decode(response.get_encoding())
-    except (NoCharsetHeaderError, UnicodeDecodeError):
-        return bytes_decode(content)
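With NoCharsetHeaderError, fallback_charset_resolver and get_response_text gone, callers decode bodies directly with whatever charset aiohttp reports, replacing undecodable bytes instead of falling back to charset detection. A rough equivalent of the replacement pattern (the helper name is invented for illustration):

    from aiohttp import ClientResponse


    def decode_body(response: ClientResponse, content: bytes) -> str:
        # get_encoding() reads the charset from the Content-Type header
        # (recent aiohttp versions fall back to UTF-8 when none is declared);
        # errors="replace" swaps malformed bytes for U+FFFD instead of raising.
        return content.decode(response.get_encoding(), errors="replace")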
23 changes: 11 additions & 12 deletions proxy_scraper_checker/output.py
@@ -1,13 +1,13 @@
 from __future__ import annotations
 
 import asyncio
-import json
 import logging
 import stat
 from shutil import rmtree
 from typing import TYPE_CHECKING
 
 import maxminddb
+import orjson
 
 from proxy_scraper_checker import fs, sort
 from proxy_scraper_checker.geodb import GEODB_PATH
@@ -63,19 +63,18 @@ async def save_proxies(*, settings: Settings, storage: ProxyStorage) -> None:
             }
             for proxy in sorted(storage, key=sort.timeout_sort_key)
         ]
-        for path, indent, separators in (
-            (settings.output_path / "proxies.json", None, (",", ":")),
-            (settings.output_path / "proxies_pretty.json", "\t", None),
+        for path, orjson_option in (
+            (settings.output_path / "proxies.json", None),
+            (
+                settings.output_path / "proxies_pretty.json",
+                orjson.OPT_INDENT_2,
+            ),
         ):
             await asyncio.to_thread(path.unlink, missing_ok=True)
-            f = await asyncio.to_thread(path.open, "w", encoding="utf-8")
-            try:
-                for chunk in json.JSONEncoder(
-                    ensure_ascii=False, indent=indent, separators=separators
-                ).iterencode(proxy_dicts):
-                    await asyncio.to_thread(f.write, chunk)
-            finally:
-                await asyncio.to_thread(f.close)
+            await asyncio.to_thread(
+                path.write_bytes,
+                orjson.dumps(proxy_dicts, option=orjson_option),
+            )
     finally:
         if mmdb is not None:
             await asyncio.to_thread(mmdb.close)
18 changes: 4 additions & 14 deletions proxy_scraper_checker/proxy.py
@@ -1,21 +1,15 @@
 from __future__ import annotations
 
-import json
 from io import StringIO
 from time import perf_counter
 from typing import TYPE_CHECKING
 
 import attrs
+import orjson
 from aiohttp import ClientSession
 from aiohttp_socks import ProxyConnector
 
-from proxy_scraper_checker.http import (
-    HEADERS,
-    SSL_CONTEXT,
-    fallback_charset_resolver,
-    get_cookie_jar,
-    get_response_text,
-)
+from proxy_scraper_checker.http import HEADERS, SSL_CONTEXT, get_cookie_jar
 from proxy_scraper_checker.parsers import parse_ipv4
 from proxy_scraper_checker.settings import CheckWebsiteType
 
@@ -63,7 +57,6 @@ async def check(self, *, settings: Settings) -> None:
                 cookie_jar=get_cookie_jar(),
                 raise_for_status=True,
                 timeout=settings.timeout,
-                fallback_charset_resolver=fallback_charset_resolver,
             ) as session,
             session.get(
                 settings.check_website,
@@ -73,13 +66,10 @@
                 content = await response.read()
         self.timeout = perf_counter() - start
         if settings.check_website_type == CheckWebsiteType.HTTPBIN_IP:
-            r = json.loads(
-                get_response_text(response=response, content=content)
-            )
-            self.exit_ip = parse_ipv4(r["origin"])
+            self.exit_ip = parse_ipv4(orjson.loads(content)["origin"])
         elif settings.check_website_type == CheckWebsiteType.PLAIN_IP:
             self.exit_ip = parse_ipv4(
-                get_response_text(response=response, content=content)
+                content.decode(response.get_encoding(), errors="replace")
             )
         else:
             self.exit_ip = None
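For reference, a toy version (not repo code) of the two check-website branches after this change: an httpbin-style JSON body is parsed straight from bytes with orjson, while a plain-IP body is decoded with the charset the server declared.

    import orjson

    # httpbin.org/ip-style body: parse the bytes directly, no decode step needed.
    httpbin_body = b'{"origin": "203.0.113.7"}'
    exit_ip = orjson.loads(httpbin_body)["origin"]

    # Plain-IP body: decode with the declared charset (assumed UTF-8 here).
    plain_body = b"203.0.113.7\n"
    exit_ip_plain = plain_body.decode("utf-8", errors="replace").strip()

    assert exit_ip == exit_ip_plain == "203.0.113.7"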
14 changes: 7 additions & 7 deletions proxy_scraper_checker/scraper.py
@@ -8,11 +8,10 @@
 from aiohttp import ClientResponseError, ClientTimeout
 from aiohttp_socks import ProxyType
 
-from proxy_scraper_checker.http import get_response_text
 from proxy_scraper_checker.incrementor import Incrementor
 from proxy_scraper_checker.parsers import PROXY_REGEX
 from proxy_scraper_checker.proxy import Proxy
-from proxy_scraper_checker.utils import bytes_decode, is_http_url
+from proxy_scraper_checker.utils import is_http_url
 
 if TYPE_CHECKING:
     from aiohttp import ClientSession
@@ -40,12 +39,13 @@ async def scrape_one(
         if is_http_url(source):
             async with session.get(source, timeout=timeout) as response:
                 content = await response.read()
-            text = get_response_text(response=response, content=content)
+            text = content.decode(response.get_encoding(), errors="replace")
         else:
-            content = await asyncio.to_thread(
-                Path(source.removeprefix("file://")).read_bytes
+            text = await asyncio.to_thread(
+                Path(source.removeprefix("file://")).read_text,
+                encoding="utf-8",
+                errors="replace",
             )
-            text = bytes_decode(content)
     except ClientResponseError as e:
         _logger.warning(
             "%s | HTTP status code %d: %s", source, e.status, e.message
@@ -112,7 +112,7 @@ async def scrape_all(
         )
         for proto, sources in settings.sources.items()
     }
-    timeout = ClientTimeout(total=settings.source_timeout)
+    timeout = ClientTimeout(total=settings.source_timeout, connect=5)
     await asyncio.gather(
         *(
             scrape_one(
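A compact illustration (path invented) of the local-source branch after the change: file:// sources are read as UTF-8 with replacement instead of being read as raw bytes and run through charset detection, so the branch can no longer raise UnicodeDecodeError.

    from pathlib import Path

    # Invented local source; real entries look like "file:///path/to/proxies.txt".
    Path("/tmp/proxies.txt").write_text("203.0.113.7:8080\n", encoding="utf-8")
    source = "file:///tmp/proxies.txt"

    # Strip the file:// prefix and read as UTF-8, replacing undecodable bytes.
    text = Path(source.removeprefix("file://")).read_text(
        encoding="utf-8", errors="replace"
    )
    print(text.splitlines())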
12 changes: 6 additions & 6 deletions proxy_scraper_checker/settings.py
@@ -2,7 +2,6 @@
 
 import asyncio
 import enum
-import json
 import logging
 import math
 import stat
@@ -14,12 +13,12 @@
 from urllib.parse import urlparse
 
 import attrs
+import orjson
 import platformdirs
 from aiohttp import ClientTimeout, hdrs
 from aiohttp_socks import ProxyType
 
 from proxy_scraper_checker import fs, sort
-from proxy_scraper_checker.http import get_response_text
 from proxy_scraper_checker.null_context import NullContext
 from proxy_scraper_checker.parsers import parse_ipv4
 from proxy_scraper_checker.utils import is_docker
@@ -148,18 +147,19 @@ async def _get_check_website_type_and_real_ip(
     try:
         async with session.get(check_website) as response:
             content = await response.read()
-            text = get_response_text(response=response, content=content)
     except Exception:
         _logger.exception(
             "Error when opening check_website without proxy, it will be "
             "impossible to determine anonymity and geolocation of proxies"
         )
         return CheckWebsiteType.UNKNOWN, None
     try:
-        js = json.loads(text)
-    except json.JSONDecodeError:
+        js = orjson.loads(content)
+    except orjson.JSONDecodeError:
         try:
-            return CheckWebsiteType.PLAIN_IP, parse_ipv4(text)
+            return CheckWebsiteType.PLAIN_IP, parse_ipv4(
+                content.decode(response.get_encoding(), errors="replace")
+            )
         except ValueError:
             pass
     else:
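A stand-alone sketch (names invented) of the detection logic this hunk ports to orjson: try to parse the check-website body as JSON; orjson.JSONDecodeError (a ValueError subclass) signals failure, at which point the body is treated as a plain-IP response.

    import orjson


    def classify_body(content: bytes, encoding: str = "utf-8") -> str:
        try:
            js = orjson.loads(content)
        except orjson.JSONDecodeError:
            # Not JSON: fall back to treating the body as a bare IP address.
            text = content.decode(encoding, errors="replace").strip()
            return f"plain ip: {text}"
        return f"httpbin-style json: {js.get('origin')}"


    print(classify_body(b'{"origin": "203.0.113.7"}'))
    print(classify_body(b"203.0.113.7\n"))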
6 changes: 0 additions & 6 deletions proxy_scraper_checker/utils.py
@@ -4,15 +4,9 @@
 from pathlib import Path
 from urllib.parse import urlparse
 
-import charset_normalizer
-
 is_docker = cache(Path("/.dockerenv").exists)
 
 
 def is_http_url(value: str, /) -> bool:
     parsed_url = urlparse(value)
     return bool(parsed_url.scheme in {"http", "https"} and parsed_url.netloc)
-
-
-def bytes_decode(value: bytes, /) -> str:
-    return str(charset_normalizer.from_bytes(value)[0])
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -11,14 +11,14 @@ requires-python = ">=3.9,<4"
 classifiers = ["Private :: Do Not Upload"]
 dependencies = [
     "aiodns>=1.1,<4",
-    "aiohttp>=3.8.6,<4",
+    "aiohttp>=3.7,<4",
     "aiohttp-socks>=0.7,<0.11",
     "attrs>=22.2",
     "brotli>=1,<2; implementation_name=='cpython'",
     "brotlicffi<2; implementation_name!='cpython'",
     "certifi",
-    "charset-normalizer>=2,<4",
     "maxminddb>=1.3,<3",
+    "orjson>=2.6,<4",
     "platformdirs<5",
     "rich>=12.3,<14",
     "tomli<3; python_version<'3.11'",