Skip to content

Commit

Permalink
Merge remote-tracking branch 'scrapy/master' into deprecation-removals
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Jan 27, 2025
2 parents 1e7dd0c + c2878b7 commit 569c515
Show file tree
Hide file tree
Showing 14 changed files with 112 additions and 146 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
fail-fast: false
matrix:
include:
- python-version: "3.12" # Keep in sync with .readthedocs.yml
- python-version: "3.13" # Keep in sync with .readthedocs.yml
env:
TOXENV: docs
- python-version: "3.13"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ jobs:
tox -e py
- name: Upload coverage report
run: bash <(curl -s https://codecov.io/bash)
uses: codecov/codecov-action@v5
7 changes: 6 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/PyCQA/bandit
rev: 1.7.10
rev: 1.8.2
hooks:
- id: bandit
args: [-r, -c, .bandit.yml]
Expand All @@ -16,3 +16,8 @@ repos:
rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/asottile/pyupgrade
rev: v3.19.1
hooks:
- id: pyupgrade
args: [--py39-plus]
4 changes: 2 additions & 2 deletions .readthedocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ sphinx:
configuration: docs/conf.py
fail_on_warning: true
build:
os: ubuntu-22.04
os: ubuntu-24.04
tools:
# For available versions, see:
# https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
python: "3.12" # Keep in sync with .github/workflows/build.yml
python: "3.13" # Keep in sync with .github/workflows/build.yml
python:
install:
- requirements: docs/requirements.txt
Expand Down
25 changes: 3 additions & 22 deletions pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,15 @@
persistent=no

[MESSAGES CONTROL]
disable=bad-continuation,
bad-whitespace,
consider-using-in,
expression-not-assigned,
fixme,
implicit-str-concat,
enable=useless-suppression
disable=fixme,
import-error,
import-outside-toplevel,
inconsistent-return-statements,
invalid-name,
len-as-condition,
line-too-long,
missing-class-docstring,
missing-function-docstring,
missing-module-docstring,
multiple-imports,
no-else-continue,
no-else-return,
no-self-use,
raise-missing-from,
redefined-builtin,
redefined-outer-name,
Expand All @@ -29,14 +19,5 @@ disable=bad-continuation,
too-many-lines,
too-many-positional-arguments,
too-many-public-methods,
trailing-comma-tuple,
trailing-newlines,
trailing-whitespace,
unidiomatic-typecheck,
unnecessary-lambda-assignment,
unreachable,
unused-argument,
unused-variable,
useless-option-value,
wrong-import-order,
wrong-import-position
unused-variable
12 changes: 7 additions & 5 deletions tests/test_encoding.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

import codecs
import unittest
from typing import Any, List, Optional, Union
from typing import Any

from w3lib.encoding import (
html_body_declared_encoding,
Expand Down Expand Up @@ -122,7 +124,7 @@ def test_invalid_utf8(self):
self.assertEqual(to_unicode(b"\xc2\xc2\xa3", "utf-8"), "\ufffd\xa3")


def ct(charset: Optional[str]) -> Optional[str]:
def ct(charset: str | None) -> str | None:
return "Content-Type: text/html; charset=" + charset if charset else None


Expand All @@ -141,10 +143,10 @@ def test_unicode_body(self):

def _assert_encoding(
self,
content_type: Optional[str],
content_type: str | None,
body: bytes,
expected_encoding: str,
expected_unicode: Union[str, List[str]],
expected_unicode: str | list[str],
) -> None:
assert not isinstance(body, str)
encoding, body_unicode = html_to_unicode(ct(content_type), body)
Expand Down Expand Up @@ -218,7 +220,7 @@ def test_replace_wrong_encoding(self):

def _assert_encoding_detected(
self,
content_type: Optional[str],
content_type: str | None,
expected_encoding: str,
body: bytes,
**kwargs: Any,
Expand Down
30 changes: 15 additions & 15 deletions tests/test_url.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations

import os
import sys
import unittest
from inspect import isclass
from typing import Callable, List, Optional, Tuple, Type, Union
from typing import Callable
from urllib.parse import urlparse

import pytest
Expand Down Expand Up @@ -35,9 +37,7 @@
# input parameters.
#
# (encoding, input URL, output URL or exception)
SAFE_URL_ENCODING_CASES: List[
Tuple[Optional[str], StrOrBytes, Union[str, Type[Exception]]]
] = [
SAFE_URL_ENCODING_CASES: list[tuple[str | None, StrOrBytes, str | type[Exception]]] = [
(None, "", ValueError),
(None, "https://example.com", "https://example.com"),
(None, "https://example.com/©", "https://example.com/%C2%A9"),
Expand Down Expand Up @@ -319,8 +319,8 @@
def _test_safe_url_func(
url: StrOrBytes,
*,
encoding: Optional[str] = None,
output: Union[str, Type[Exception]],
encoding: str | None = None,
output: str | type[Exception],
func: Callable[..., str],
) -> None:
kwargs = {}
Expand All @@ -338,8 +338,8 @@ def _test_safe_url_func(
def _test_safe_url_string(
url: StrOrBytes,
*,
encoding: Optional[str] = None,
output: Union[str, Type[Exception]],
encoding: str | None = None,
output: str | type[Exception],
) -> None:
return _test_safe_url_func(
url,
Expand Down Expand Up @@ -373,7 +373,7 @@ def _test_safe_url_string(
),
)
def test_safe_url_string_encoding(
encoding: Optional[str], url: StrOrBytes, output: Union[str, Type[Exception]]
encoding: str | None, url: StrOrBytes, output: str | type[Exception]
) -> None:
_test_safe_url_string(url, encoding=encoding, output=output)

Expand Down Expand Up @@ -439,9 +439,7 @@ def test_safe_url_string_encoding(
for case in SAFE_URL_URL_CASES
),
)
def test_safe_url_string_url(
url: StrOrBytes, output: Union[str, Type[Exception]]
) -> None:
def test_safe_url_string_url(url: StrOrBytes, output: str | type[Exception]) -> None:
_test_safe_url_string(url, output=output)


Expand Down Expand Up @@ -858,6 +856,7 @@ def test_url_query_parameter(self):
url_query_parameter("product.html?id=", "id", keep_blank_values=1), ""
)

@pytest.mark.xfail
def test_url_query_parameter_2(self):
"""
This problem was seen several times in the feeds. Sometimes affiliate URLs contain
Expand All @@ -873,14 +872,14 @@ def test_url_query_parameter_2(self):
and the URL extraction will fail. The current workaround was made in the spider:
it simply replaces &#39; with %27
"""
return # FIXME: this test should pass but currently doesnt
# correct case
aff_url1 = "http://www.anrdoezrs.net/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EGarden+table+and+chair+sets%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357199%2526langId%253D-1"
aff_url2 = url_query_parameter(aff_url1, "url")
self.assertEqual(
aff_url2,
"http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Garden table and chair sets&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357199%26langId%3D-1",
)
assert aff_url2 is not None
prod_url = url_query_parameter(aff_url2, "referredURL")
self.assertEqual(
prod_url,
Expand All @@ -893,6 +892,7 @@ def test_url_query_parameter_2(self):
aff_url2,
"http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Children&#39;s garden furniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1",
)
assert aff_url2 is not None
prod_url = url_query_parameter(aff_url2, "referredURL")
# fails, prod_url is None now
self.assertEqual(
Expand Down Expand Up @@ -1574,7 +1574,7 @@ def test_mediatype_parameters(self):
self.assertEqual(result.data, b"\xce\x8e\xce\xa3\xce\x8e")

def test_base64(self):
result = parse_data_uri("data:text/plain;base64," "SGVsbG8sIHdvcmxkLg%3D%3D")
result = parse_data_uri("data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D")
self.assertEqual(result.media_type, "text/plain")
self.assertEqual(result.data, b"Hello, world.")

Expand All @@ -1587,7 +1587,7 @@ def test_base64_spaces(self):
self.assertEqual(result.data, b"Hello, world.")

result = parse_data_uri(
"data:text/plain;base64,SGVsb G8sIH\n " "dvcm xk Lg%3D\n%3D"
"data:text/plain;base64,SGVsb G8sIH\n dvcm xk Lg%3D\n%3D"
)
self.assertEqual(result.media_type, "text/plain")
self.assertEqual(result.data, b"Hello, world.")
Expand Down
8 changes: 4 additions & 4 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ basepython = python3
deps =
# mypy would error if pytest (or its stub) is not found
pytest
mypy==1.11.2
mypy==1.14.1
commands =
mypy --strict {posargs: w3lib tests}

[testenv:pylint]
deps =
{[testenv]deps}
pylint==3.3.1
pylint==3.3.3
commands =
pylint conftest.py docs setup.py tests w3lib

Expand All @@ -46,8 +46,8 @@ skip_install = true
[testenv:twinecheck]
basepython = python3
deps =
twine==5.1.1
build==1.2.2
twine==6.1.0
build==1.2.2.post1
commands =
python -m build --sdist
twine check dist/*
2 changes: 2 additions & 0 deletions w3lib/_types.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

from typing import Union

# the base class UnicodeError doesn't have attributes like start / end
Expand Down
24 changes: 13 additions & 11 deletions w3lib/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,21 @@
Functions for handling encoding of web pages
"""

from __future__ import annotations

import codecs
import encodings
import re
from typing import Callable, Match, Optional, Tuple, Union, cast
from re import Match
from typing import Callable, cast

import w3lib.util
from w3lib._types import AnyUnicodeError, StrOrBytes

_HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I)


def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
def http_content_type_encoding(content_type: str | None) -> str | None:
"""Extract the encoding in the content-type header
>>> import w3lib.encoding
Expand Down Expand Up @@ -49,7 +52,6 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
_XML_ENCODING_RE = _TEMPLATE % ("encoding", r"(?P<xmlcharset>[\w-]+)")

# check for meta tags, or xml decl. and stop search if a body tag is encountered
# pylint: disable=consider-using-f-string
_BODY_ENCODING_PATTERN = (
r"<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)"
% (_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
Expand All @@ -60,7 +62,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
)


def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
def html_body_declared_encoding(html_body_str: StrOrBytes) -> str | None:
'''Return the encoding specified in meta tags in the html body,
or ``None`` if no suitable encoding was found
Expand All @@ -84,7 +86,7 @@ def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:

# html5 suggests the first 1024 bytes are sufficient, we allow for more
chunk = html_body_str[:4096]
match: Union[Optional[Match[bytes]], Optional[Match[str]]]
match: Match[bytes] | Match[str] | None
if isinstance(chunk, bytes):
match = _BODY_ENCODING_BYTES_RE.search(chunk)
else:
Expand Down Expand Up @@ -140,7 +142,7 @@ def _c18n_encoding(encoding: str) -> str:
return cast(str, encodings.aliases.aliases.get(normed, normed))


def resolve_encoding(encoding_alias: str) -> Optional[str]:
def resolve_encoding(encoding_alias: str) -> str | None:
"""Return the encoding that `encoding_alias` maps to, or ``None``
if the encoding cannot be interpreted
Expand Down Expand Up @@ -170,7 +172,7 @@ def resolve_encoding(encoding_alias: str) -> Optional[str]:
_FIRST_CHARS = {c[0] for (c, _) in _BOM_TABLE}


def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]:
def read_bom(data: bytes) -> tuple[None, None] | tuple[str, bytes]:
r"""Read the byte order mark in the text, if present, and
return the encoding represented by the BOM and the BOM.
Expand Down Expand Up @@ -216,11 +218,11 @@ def to_unicode(data_str: bytes, encoding: str) -> str:


def html_to_unicode(
content_type_header: Optional[str],
content_type_header: str | None,
html_body_str: bytes,
default_encoding: str = "utf8",
auto_detect_fun: Optional[Callable[[bytes], Optional[str]]] = None,
) -> Tuple[str, str]:
auto_detect_fun: Callable[[bytes], str | None] | None = None,
) -> tuple[str, str]:
r'''Convert raw html bytes to unicode
This attempts to make a reasonable guess at the content encoding of the
Expand Down Expand Up @@ -289,7 +291,7 @@ def html_to_unicode(

enc = http_content_type_encoding(content_type_header)
if enc is not None:
if enc == "utf-16" or enc == "utf-32":
if enc in {"utf-16", "utf-32"}:
enc += "-be"
return enc, to_unicode(html_body_str, enc)
enc = html_body_declared_encoding(html_body_str)
Expand Down
Loading

0 comments on commit 569c515

Please sign in to comment.