Merge remote-tracking branch 'scrapy/master' into deprecation-removals

scrapy · Jan 27, 2025 · 569c515 · 569c515
2 parents 1e7dd0c + c2878b7
commit 569c515
Show file tree

Hide file tree

Showing 14 changed files with 112 additions and 146 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         include:
-        - python-version: "3.12"  # Keep in sync with .readthedocs.yml
+        - python-version: "3.13"  # Keep in sync with .readthedocs.yml
           env:
             TOXENV: docs
         - python-version: "3.13"

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -32,4 +32,4 @@ jobs:
         tox -e py
 
     - name: Upload coverage report
-      run: bash <(curl -s https://codecov.io/bash)
+      uses: codecov/codecov-action@v5
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
 - repo: https://github.com/PyCQA/bandit
-  rev: 1.7.10
+  rev: 1.8.2
   hooks:
   - id: bandit
     args: [-r, -c, .bandit.yml]
@@ -16,3 +16,8 @@ repos:
   rev: 5.13.2
   hooks:
   - id: isort
+- repo: https://github.com/asottile/pyupgrade
+  rev: v3.19.1
+  hooks:
+  - id: pyupgrade
+    args: [--py39-plus]
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -4,11 +4,11 @@ sphinx:
   configuration: docs/conf.py
   fail_on_warning: true
 build:
-  os: ubuntu-22.04
+  os: ubuntu-24.04
   tools:
     # For available versions, see:
     # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
-    python: "3.12"  # Keep in sync with .github/workflows/build.yml
+    python: "3.13"  # Keep in sync with .github/workflows/build.yml
 python:
   install:
     - requirements: docs/requirements.txt

diff --git a/pylintrc b/pylintrc
@@ -2,25 +2,15 @@
 persistent=no
 
 [MESSAGES CONTROL]
-disable=bad-continuation,
-        bad-whitespace,
-        consider-using-in,
-        expression-not-assigned,
-        fixme,
-        implicit-str-concat,
+enable=useless-suppression
+disable=fixme,
         import-error,
         import-outside-toplevel,
-        inconsistent-return-statements,
         invalid-name,
-        len-as-condition,
         line-too-long,
         missing-class-docstring,
         missing-function-docstring,
         missing-module-docstring,
-        multiple-imports,
-        no-else-continue,
-        no-else-return,
-        no-self-use,
         raise-missing-from,
         redefined-builtin,
         redefined-outer-name,
@@ -29,14 +19,5 @@ disable=bad-continuation,
         too-many-lines,
         too-many-positional-arguments,
         too-many-public-methods,
-        trailing-comma-tuple,
-        trailing-newlines,
-        trailing-whitespace,
-        unidiomatic-typecheck,
-        unnecessary-lambda-assignment,
-        unreachable,
         unused-argument,
-        unused-variable,
-        useless-option-value,
-        wrong-import-order,
-        wrong-import-position
+        unused-variable
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
@@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import codecs
 import unittest
-from typing import Any, List, Optional, Union
+from typing import Any
 
 from w3lib.encoding import (
     html_body_declared_encoding,
@@ -122,7 +124,7 @@ def test_invalid_utf8(self):
         self.assertEqual(to_unicode(b"\xc2\xc2\xa3", "utf-8"), "\ufffd\xa3")
 
 
-def ct(charset: Optional[str]) -> Optional[str]:
+def ct(charset: str | None) -> str | None:
     return "Content-Type: text/html; charset=" + charset if charset else None
 
 
@@ -141,10 +143,10 @@ def test_unicode_body(self):
 
     def _assert_encoding(
         self,
-        content_type: Optional[str],
+        content_type: str | None,
         body: bytes,
         expected_encoding: str,
-        expected_unicode: Union[str, List[str]],
+        expected_unicode: str | list[str],
     ) -> None:
         assert not isinstance(body, str)
         encoding, body_unicode = html_to_unicode(ct(content_type), body)
@@ -218,7 +220,7 @@ def test_replace_wrong_encoding(self):
 
     def _assert_encoding_detected(
         self,
-        content_type: Optional[str],
+        content_type: str | None,
         expected_encoding: str,
         body: bytes,
         **kwargs: Any,

diff --git a/tests/test_url.py b/tests/test_url.py
@@ -1,8 +1,10 @@
+from __future__ import annotations
+
 import os
 import sys
 import unittest
 from inspect import isclass
-from typing import Callable, List, Optional, Tuple, Type, Union
+from typing import Callable
 from urllib.parse import urlparse
 
 import pytest
@@ -35,9 +37,7 @@
 # input parameters.
 #
 # (encoding, input URL, output URL or exception)
-SAFE_URL_ENCODING_CASES: List[
-    Tuple[Optional[str], StrOrBytes, Union[str, Type[Exception]]]
-] = [
+SAFE_URL_ENCODING_CASES: list[tuple[str | None, StrOrBytes, str | type[Exception]]] = [
     (None, "", ValueError),
     (None, "https://example.com", "https://example.com"),
     (None, "https://example.com/©", "https://example.com/%C2%A9"),
@@ -319,8 +319,8 @@
 def _test_safe_url_func(
     url: StrOrBytes,
     *,
-    encoding: Optional[str] = None,
-    output: Union[str, Type[Exception]],
+    encoding: str | None = None,
+    output: str | type[Exception],
     func: Callable[..., str],
 ) -> None:
     kwargs = {}
@@ -338,8 +338,8 @@ def _test_safe_url_func(
 def _test_safe_url_string(
     url: StrOrBytes,
     *,
-    encoding: Optional[str] = None,
-    output: Union[str, Type[Exception]],
+    encoding: str | None = None,
+    output: str | type[Exception],
 ) -> None:
     return _test_safe_url_func(
         url,
@@ -373,7 +373,7 @@ def _test_safe_url_string(
     ),
 )
 def test_safe_url_string_encoding(
-    encoding: Optional[str], url: StrOrBytes, output: Union[str, Type[Exception]]
+    encoding: str | None, url: StrOrBytes, output: str | type[Exception]
 ) -> None:
     _test_safe_url_string(url, encoding=encoding, output=output)
 
@@ -439,9 +439,7 @@ def test_safe_url_string_encoding(
         for case in SAFE_URL_URL_CASES
     ),
 )
-def test_safe_url_string_url(
-    url: StrOrBytes, output: Union[str, Type[Exception]]
-) -> None:
+def test_safe_url_string_url(url: StrOrBytes, output: str | type[Exception]) -> None:
     _test_safe_url_string(url, output=output)
 
 
@@ -858,6 +856,7 @@ def test_url_query_parameter(self):
             url_query_parameter("product.html?id=", "id", keep_blank_values=1), ""
         )
 
+    @pytest.mark.xfail
     def test_url_query_parameter_2(self):
         """
         This problem was seen several times in the feeds. Sometime affiliate URLs contains
@@ -873,14 +872,14 @@ def test_url_query_parameter_2(self):
         and the URL extraction will fail, current workaround was made in the spider,
         just a replace for &#39; to %27
         """
-        return  # FIXME: this test should pass but currently doesnt
         # correct case
         aff_url1 = "http://www.anrdoezrs.net/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EGarden+table+and+chair+sets%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357199%2526langId%253D-1"
         aff_url2 = url_query_parameter(aff_url1, "url")
         self.assertEqual(
             aff_url2,
             "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Garden table and chair sets&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357199%26langId%3D-1",
         )
+        assert aff_url2 is not None
         prod_url = url_query_parameter(aff_url2, "referredURL")
         self.assertEqual(
             prod_url,
@@ -893,6 +892,7 @@ def test_url_query_parameter_2(self):
             aff_url2,
             "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Children&#39;s garden furniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1",
         )
+        assert aff_url2 is not None
         prod_url = url_query_parameter(aff_url2, "referredURL")
         # fails, prod_url is None now
         self.assertEqual(
@@ -1574,7 +1574,7 @@ def test_mediatype_parameters(self):
         self.assertEqual(result.data, b"\xce\x8e\xce\xa3\xce\x8e")
 
     def test_base64(self):
-        result = parse_data_uri("data:text/plain;base64," "SGVsbG8sIHdvcmxkLg%3D%3D")
+        result = parse_data_uri("data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D")
         self.assertEqual(result.media_type, "text/plain")
         self.assertEqual(result.data, b"Hello, world.")
 
@@ -1587,7 +1587,7 @@ def test_base64_spaces(self):
         self.assertEqual(result.data, b"Hello, world.")
 
         result = parse_data_uri(
-            "data:text/plain;base64,SGVsb G8sIH\n  " "dvcm   xk Lg%3D\n%3D"
+            "data:text/plain;base64,SGVsb G8sIH\n  dvcm   xk Lg%3D\n%3D"
         )
         self.assertEqual(result.media_type, "text/plain")
         self.assertEqual(result.data, b"Hello, world.")

diff --git a/tox.ini b/tox.ini
@@ -21,14 +21,14 @@ basepython = python3
 deps =
     # mypy would error if pytest (or its stub) not found
     pytest
-    mypy==1.11.2
+    mypy==1.14.1
 commands =
     mypy --strict {posargs: w3lib tests}
 
 [testenv:pylint]
 deps =
     {[testenv]deps}
-    pylint==3.3.1
+    pylint==3.3.3
 commands =
     pylint conftest.py docs setup.py tests w3lib
 
@@ -46,8 +46,8 @@ skip_install = true
 [testenv:twinecheck]
 basepython = python3
 deps =
-    twine==5.1.1
-    build==1.2.2
+    twine==6.1.0
+    build==1.2.2.post1
 commands =
     python -m build --sdist
     twine check dist/*
diff --git a/w3lib/_types.py b/w3lib/_types.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from typing import Union
 
 # the base class UnicodeError doesn't have attributes like start / end

diff --git a/w3lib/encoding.py b/w3lib/encoding.py
@@ -2,18 +2,21 @@
 Functions for handling encoding of web pages
 """
 
+from __future__ import annotations
+
 import codecs
 import encodings
 import re
-from typing import Callable, Match, Optional, Tuple, Union, cast
+from re import Match
+from typing import Callable, cast
 
 import w3lib.util
 from w3lib._types import AnyUnicodeError, StrOrBytes
 
 _HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I)
 
 
-def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
+def http_content_type_encoding(content_type: str | None) -> str | None:
     """Extract the encoding in the content-type header
 
     >>> import w3lib.encoding
@@ -49,7 +52,6 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
 _XML_ENCODING_RE = _TEMPLATE % ("encoding", r"(?P<xmlcharset>[\w-]+)")
 
 # check for meta tags, or xml decl. and stop search if a body tag is encountered
-# pylint: disable=consider-using-f-string
 _BODY_ENCODING_PATTERN = (
     r"<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)"
     % (_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
@@ -60,7 +62,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
 )
 
 
-def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
+def html_body_declared_encoding(html_body_str: StrOrBytes) -> str | None:
     '''Return the encoding specified in meta tags in the html body,
     or ``None`` if no suitable encoding was found
 
@@ -84,7 +86,7 @@ def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
 
     # html5 suggests the first 1024 bytes are sufficient, we allow for more
     chunk = html_body_str[:4096]
-    match: Union[Optional[Match[bytes]], Optional[Match[str]]]
+    match: Match[bytes] | Match[str] | None
     if isinstance(chunk, bytes):
         match = _BODY_ENCODING_BYTES_RE.search(chunk)
     else:
@@ -140,7 +142,7 @@ def _c18n_encoding(encoding: str) -> str:
     return cast(str, encodings.aliases.aliases.get(normed, normed))
 
 
-def resolve_encoding(encoding_alias: str) -> Optional[str]:
+def resolve_encoding(encoding_alias: str) -> str | None:
     """Return the encoding that `encoding_alias` maps to, or ``None``
     if the encoding cannot be interpreted
 
@@ -170,7 +172,7 @@ def resolve_encoding(encoding_alias: str) -> Optional[str]:
 _FIRST_CHARS = {c[0] for (c, _) in _BOM_TABLE}
 
 
-def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]:
+def read_bom(data: bytes) -> tuple[None, None] | tuple[str, bytes]:
     r"""Read the byte order mark in the text, if present, and
     return the encoding represented by the BOM and the BOM.
 
@@ -216,11 +218,11 @@ def to_unicode(data_str: bytes, encoding: str) -> str:
 
 
 def html_to_unicode(
-    content_type_header: Optional[str],
+    content_type_header: str | None,
     html_body_str: bytes,
     default_encoding: str = "utf8",
-    auto_detect_fun: Optional[Callable[[bytes], Optional[str]]] = None,
-) -> Tuple[str, str]:
+    auto_detect_fun: Callable[[bytes], str | None] | None = None,
+) -> tuple[str, str]:
     r'''Convert raw html bytes to unicode
 
     This attempts to make a reasonable guess at the content encoding of the
@@ -289,7 +291,7 @@ def html_to_unicode(
 
     enc = http_content_type_encoding(content_type_header)
     if enc is not None:
-        if enc == "utf-16" or enc == "utf-32":
+        if enc in {"utf-16", "utf-32"}:
             enc += "-be"
         return enc, to_unicode(html_body_str, enc)
     enc = html_body_declared_encoding(html_body_str)