diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 33ce9f4..a98b3dd 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - python-version: "3.12"  # Keep in sync with .readthedocs.yml
+          - python-version: "3.13"  # Keep in sync with .readthedocs.yml
            env:
              TOXENV: docs
          - python-version: "3.13"
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9c60e48..7ce4b71 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -32,4 +32,4 @@ jobs:
         tox -e py
 
     - name: Upload coverage report
-      run: bash <(curl -s https://codecov.io/bash)
+      uses: codecov/codecov-action@v5
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ab99544..1a74856 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
 - repo: https://github.com/PyCQA/bandit
-  rev: 1.7.10
+  rev: 1.8.2
   hooks:
   - id: bandit
     args: [-r, -c, .bandit.yml]
@@ -16,3 +16,8 @@ repos:
   rev: 5.13.2
   hooks:
   - id: isort
+- repo: https://github.com/asottile/pyupgrade
+  rev: v3.19.1
+  hooks:
+  - id: pyupgrade
+    args: [--py39-plus]
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 5ba0d2a..2b53eb1 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -4,11 +4,11 @@ sphinx:
   configuration: docs/conf.py
   fail_on_warning: true
 build:
-  os: ubuntu-22.04
+  os: ubuntu-24.04
   tools:
     # For available versions, see:
     # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
-    python: "3.12"  # Keep in sync with .github/workflows/build.yml
+    python: "3.13"  # Keep in sync with .github/workflows/build.yml
 python:
   install:
     - requirements: docs/requirements.txt
diff --git a/pylintrc b/pylintrc
index 441c214..7943ad1 100644
--- a/pylintrc
+++ b/pylintrc
@@ -2,25 +2,15 @@
 persistent=no
 
 [MESSAGES CONTROL]
-disable=bad-continuation,
-    bad-whitespace,
-    consider-using-in,
-    expression-not-assigned,
-    fixme,
-    implicit-str-concat,
+enable=useless-suppression
+disable=fixme,
     import-error,
     import-outside-toplevel,
-    inconsistent-return-statements,
     invalid-name,
-    len-as-condition,
     line-too-long,
     missing-class-docstring,
     missing-function-docstring,
     missing-module-docstring,
-    multiple-imports,
-    no-else-continue,
-    no-else-return,
-    no-self-use,
     raise-missing-from,
     redefined-builtin,
     redefined-outer-name,
@@ -29,14 +19,5 @@ disable=bad-continuation,
     too-many-lines,
     too-many-positional-arguments,
     too-many-public-methods,
-    trailing-comma-tuple,
-    trailing-newlines,
-    trailing-whitespace,
-    unidiomatic-typecheck,
-    unnecessary-lambda-assignment,
-    unreachable,
     unused-argument,
-    unused-variable,
-    useless-option-value,
-    wrong-import-order,
-    wrong-import-position
+    unused-variable
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 3a2f5c4..8911568 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import codecs
 import unittest
-from typing import Any, List, Optional, Union
+from typing import Any
 
 from w3lib.encoding import (
     html_body_declared_encoding,
@@ -122,7 +124,7 @@ def test_invalid_utf8(self):
         self.assertEqual(to_unicode(b"\xc2\xc2\xa3", "utf-8"), "\ufffd\xa3")
 
 
-def ct(charset: Optional[str]) -> Optional[str]:
+def ct(charset: str | None) -> str | None:
     return "Content-Type: text/html; charset=" + charset if charset else None
 
 
@@ -141,10 +143,10 @@ def test_unicode_body(self):
 
     def _assert_encoding(
         self,
-        content_type: Optional[str],
+        content_type: str | None,
         body: bytes,
         expected_encoding: str,
-        expected_unicode: Union[str, List[str]],
+        expected_unicode: str | list[str],
     ) -> None:
         assert not isinstance(body, str)
         encoding, body_unicode = html_to_unicode(ct(content_type), body)
@@ -218,7 +220,7 @@ def test_replace_wrong_encoding(self):
 
     def _assert_encoding_detected(
         self,
-        content_type: Optional[str],
+        content_type: str | None,
         expected_encoding: str,
         body: bytes,
         **kwargs: Any,
diff --git a/tests/test_url.py b/tests/test_url.py
index 8a7b02d..7444ab9 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -1,8 +1,10 @@
+from __future__ import annotations
+
 import os
 import sys
 import unittest
 from inspect import isclass
-from typing import Callable, List, Optional, Tuple, Type, Union
+from typing import Callable
 from urllib.parse import urlparse
 
 import pytest
@@ -35,9 +37,7 @@
 # input parameters.
 #
 # (encoding, input URL, output URL or exception)
-SAFE_URL_ENCODING_CASES: List[
-    Tuple[Optional[str], StrOrBytes, Union[str, Type[Exception]]]
-] = [
+SAFE_URL_ENCODING_CASES: list[tuple[str | None, StrOrBytes, str | type[Exception]]] = [
     (None, "", ValueError),
     (None, "https://example.com", "https://example.com"),
     (None, "https://example.com/©", "https://example.com/%C2%A9"),
@@ -319,8 +319,8 @@
 def _test_safe_url_func(
     url: StrOrBytes,
     *,
-    encoding: Optional[str] = None,
-    output: Union[str, Type[Exception]],
+    encoding: str | None = None,
+    output: str | type[Exception],
     func: Callable[..., str],
 ) -> None:
     kwargs = {}
@@ -338,8 +338,8 @@
 def _test_safe_url_string(
     url: StrOrBytes,
     *,
-    encoding: Optional[str] = None,
-    output: Union[str, Type[Exception]],
+    encoding: str | None = None,
+    output: str | type[Exception],
 ) -> None:
     return _test_safe_url_func(
         url,
@@ -373,7 +373,7 @@ def _test_safe_url_string(
     ),
 )
 def test_safe_url_string_encoding(
-    encoding: Optional[str], url: StrOrBytes, output: Union[str, Type[Exception]]
+    encoding: str | None, url: StrOrBytes, output: str | type[Exception]
 ) -> None:
     _test_safe_url_string(url, encoding=encoding, output=output)
 
@@ -439,9 +439,7 @@ def test_safe_url_string_encoding(
         for case in SAFE_URL_URL_CASES
     ),
 )
-def test_safe_url_string_url(
-    url: StrOrBytes, output: Union[str, Type[Exception]]
-) -> None:
+def test_safe_url_string_url(url: StrOrBytes, output: str | type[Exception]) -> None:
     _test_safe_url_string(url, output=output)
 
 
@@ -858,6 +856,7 @@ def test_url_query_parameter(self):
             url_query_parameter("product.html?id=", "id", keep_blank_values=1), ""
         )
 
+    @pytest.mark.xfail
     def test_url_query_parameter_2(self):
         """
         This problem was seen several times in the feeds. Sometime affiliate URLs contains
@@ -873,7 +872,6 @@ def test_url_query_parameter_2(self):
         and the URL extraction will fail, current workaround was made in the spider,
         just a replace for ' to %27
         """
-        return  # FIXME: this test should pass but currently doesnt
         # correct case
         aff_url1 = "http://www.anrdoezrs.net/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EGarden+table+and+chair+sets%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357199%2526langId%253D-1"
         aff_url2 = url_query_parameter(aff_url1, "url")
@@ -881,6 +879,7 @@ def test_url_query_parameter_2(self):
             aff_url2,
             "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Garden table and chair sets&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357199%26langId%3D-1",
         )
+        assert aff_url2 is not None
         prod_url = url_query_parameter(aff_url2, "referredURL")
         self.assertEqual(
             prod_url,
@@ -893,6 +892,7 @@ def test_url_query_parameter_2(self):
             aff_url2,
             "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Children's garden furniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1",
         )
+        assert aff_url2 is not None
         prod_url = url_query_parameter(aff_url2, "referredURL")
         # fails, prod_url is None now
         self.assertEqual(
@@ -1574,7 +1574,7 @@ def test_mediatype_parameters(self):
         self.assertEqual(result.data, b"\xce\x8e\xce\xa3\xce\x8e")
 
     def test_base64(self):
-        result = parse_data_uri("data:text/plain;base64," "SGVsbG8sIHdvcmxkLg%3D%3D")
+        result = parse_data_uri("data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D")
         self.assertEqual(result.media_type, "text/plain")
         self.assertEqual(result.data, b"Hello, world.")
 
@@ -1587,7 +1587,7 @@ def test_base64_spaces(self):
         self.assertEqual(result.data, b"Hello, world.")
 
         result = parse_data_uri(
-            "data:text/plain;base64,SGVsb G8sIH\n " "dvcm xk Lg%3D\n%3D"
+            "data:text/plain;base64,SGVsb G8sIH\n dvcm xk Lg%3D\n%3D"
         )
         self.assertEqual(result.media_type, "text/plain")
         self.assertEqual(result.data, b"Hello, world.")
diff --git a/tox.ini b/tox.ini
index ae68608..8116228 100644
--- a/tox.ini
+++ b/tox.ini
@@ -21,14 +21,14 @@ basepython = python3
 deps =
     # mypy would error if pytest (or its stub) not found
     pytest
-    mypy==1.11.2
+    mypy==1.14.1
 commands =
     mypy --strict {posargs: w3lib tests}
 
 [testenv:pylint]
 deps =
     {[testenv]deps}
-    pylint==3.3.1
+    pylint==3.3.3
 commands =
     pylint conftest.py docs setup.py tests w3lib
 
@@ -46,8 +46,8 @@ skip_install = true
 [testenv:twinecheck]
 basepython = python3
 deps =
-    twine==5.1.1
-    build==1.2.2
+    twine==6.1.0
+    build==1.2.2.post1
 commands =
     python -m build --sdist
     twine check dist/*
diff --git a/w3lib/_types.py b/w3lib/_types.py
index 84499a6..90e590f 100644
--- a/w3lib/_types.py
+++ b/w3lib/_types.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from typing import Union
 
 # the base class UnicodeError doesn't have attributes like start / end
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index 8877c6f..1269c9d 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -2,10 +2,13 @@
 Functions for handling encoding of web pages
 """
 
+from __future__ import annotations
+
 import codecs
 import encodings
 import re
-from typing import Callable, Match, Optional, Tuple, Union, cast
+from re import Match
+from typing import Callable, cast
 
 import w3lib.util
 from w3lib._types import AnyUnicodeError, StrOrBytes
@@ -13,7 +16,7 @@
 _HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I)
 
 
-def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
+def http_content_type_encoding(content_type: str | None) -> str | None:
     """Extract the encoding in the content-type header
 
     >>> import w3lib.encoding
@@ -49,7 +52,6 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
 _XML_ENCODING_RE = _TEMPLATE % ("encoding", r"(?P<xmlcharset>[\w-]+)")
 
 # check for meta tags, or xml decl. and stop search if a body tag is encountered
-# pylint: disable=consider-using-f-string
 _BODY_ENCODING_PATTERN = (
     r"<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)"
     % (_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
@@ -60,7 +62,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
 )
 
 
-def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
+def html_body_declared_encoding(html_body_str: StrOrBytes) -> str | None:
     '''Return the encoding specified in meta tags in the html body,
     or ``None`` if no suitable encoding was found
 
@@ -84,7 +86,7 @@
 
     # html5 suggests the first 1024 bytes are sufficient, we allow for more
     chunk = html_body_str[:4096]
-    match: Union[Optional[Match[bytes]], Optional[Match[str]]]
+    match: Match[bytes] | Match[str] | None
     if isinstance(chunk, bytes):
         match = _BODY_ENCODING_BYTES_RE.search(chunk)
     else:
@@ -140,7 +142,7 @@ def _c18n_encoding(encoding: str) -> str:
     return cast(str, encodings.aliases.aliases.get(normed, normed))
 
 
-def resolve_encoding(encoding_alias: str) -> Optional[str]:
+def resolve_encoding(encoding_alias: str) -> str | None:
     """Return the encoding that `encoding_alias` maps to, or ``None``
     if the encoding cannot be interpreted
 
@@ -170,7 +172,7 @@
 _FIRST_CHARS = {c[0] for (c, _) in _BOM_TABLE}
 
 
-def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]:
+def read_bom(data: bytes) -> tuple[None, None] | tuple[str, bytes]:
     r"""Read the byte order mark in the text, if present, and return the
     encoding represented by the BOM and the BOM.
 
@@ -216,11 +218,11 @@
 
 
 def html_to_unicode(
-    content_type_header: Optional[str],
+    content_type_header: str | None,
     html_body_str: bytes,
     default_encoding: str = "utf8",
-    auto_detect_fun: Optional[Callable[[bytes], Optional[str]]] = None,
-) -> Tuple[str, str]:
+    auto_detect_fun: Callable[[bytes], str | None] | None = None,
+) -> tuple[str, str]:
     r'''Convert raw html bytes to unicode
 
     This attempts to make a reasonable guess at the content encoding of the
@@ -289,7 +291,7 @@ def html_to_unicode(
 
     enc = http_content_type_encoding(content_type_header)
     if enc is not None:
-        if enc == "utf-16" or enc == "utf-32":
+        if enc in {"utf-16", "utf-32"}:
             enc += "-be"
         return enc, to_unicode(html_body_str, enc)
     enc = html_body_declared_encoding(html_body_str)
diff --git a/w3lib/html.py b/w3lib/html.py
index 1a4bc21..8c78c86 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -2,9 +2,12 @@
 Functions for dealing with markup text
 """
 
+from __future__ import annotations
+
 import re
+from collections.abc import Iterable
 from html.entities import name2codepoint
-from typing import Iterable, Match, Optional, Pattern, Tuple, Union
+from re import Match, Pattern
 from urllib.parse import urljoin
 
 from w3lib._types import StrOrBytes
@@ -77,10 +80,9 @@ def convert_entity(m: Match[str]) -> str:
             entity_name = groups["named"]
             if entity_name.lower() in keep:
                 return m.group(0)
-            else:
-                number = name2codepoint.get(entity_name) or name2codepoint.get(
-                    entity_name.lower()
-                )
+            number = name2codepoint.get(entity_name) or name2codepoint.get(
+                entity_name.lower()
+            )
 
         if number is not None:
             # Numeric character references in the 80-9F range are typically
@@ -89,8 +91,7 @@ def convert_entity(m: Match[str]) -> str:
             try:
                 if 0x80 <= number <= 0x9F:
                     return bytes((number,)).decode("cp1252")
-                else:
-                    return chr(number)
+                return chr(number)
             except (ValueError, OverflowError):
                 pass
 
@@ -99,13 +100,11 @@ def convert_entity(m: Match[str]) -> str:
     return _ent_re.sub(convert_entity, to_unicode(text, encoding))
 
 
-def has_entities(text: StrOrBytes, encoding: Optional[str] = None) -> bool:
+def has_entities(text: StrOrBytes, encoding: str | None = None) -> bool:
     return bool(_ent_re.search(to_unicode(text, encoding)))
 
 
-def replace_tags(
-    text: StrOrBytes, token: str = "", encoding: Optional[str] = None
-) -> str:
+def replace_tags(text: StrOrBytes, token: str = "", encoding: str | None = None) -> str:
     """Replace all markup tags found in the given `text` by the given token.
     By default `token` is an empty string so it just removes all tags.
 
@@ -131,7 +130,7 @@ def replace_tags(
 _REMOVECOMMENTS_RE = re.compile("<!--.*?(?:-->|$)", re.DOTALL)
 
 
-def remove_comments(text: StrOrBytes, encoding: Optional[str] = None) -> str:
+def remove_comments(text: StrOrBytes, encoding: str | None = None) -> str:
     """Remove HTML Comments.
 
     >>> import w3lib.html
@@ -149,7 +148,7 @@ def remove_tags(
     text: StrOrBytes,
     which_ones: Iterable[str] = (),
     keep: Iterable[str] = (),
-    encoding: Optional[str] = None,
+    encoding: str | None = None,
 ) -> str:
     """Remove HTML Tags only.
 
@@ -204,8 +203,7 @@ def will_remove(tag: str) -> bool:
         tag = tag.lower()
         if which_ones:
             return tag in which_ones
-        else:
-            return tag not in keep
+        return tag not in keep
 
     def remove_tag(m: Match[str]) -> str:
         tag = m.group(1)
@@ -218,7 +216,7 @@ def remove_tag(m: Match[str]) -> str:
 
 
 def remove_tags_with_content(
-    text: StrOrBytes, which_ones: Iterable[str] = (), encoding: Optional[str] = None
+    text: StrOrBytes, which_ones: Iterable[str] = (), encoding: str | None = None
 ) -> str:
     """Remove tags and their content.
 
@@ -245,7 +243,7 @@ def replace_escape_chars(
     text: StrOrBytes,
     which_ones: Iterable[str] = ("\n", "\t", "\r"),
     replace_by: StrOrBytes = "",
-    encoding: Optional[str] = None,
+    encoding: str | None = None,
 ) -> str:
     """Remove escape characters.
 
@@ -267,7 +265,7 @@ def unquote_markup(
     text: StrOrBytes,
     keep: Iterable[str] = (),
     remove_illegal: bool = True,
-    encoding: Optional[str] = None,
+    encoding: str | None = None,
 ) -> str:
     """
     This function receives markup as a text (always a unicode string or
@@ -280,9 +278,7 @@ def unquote_markup(
 
     """
 
-    def _get_fragments(
-        txt: str, pattern: Pattern[str]
-    ) -> Iterable[Union[str, Match[str]]]:
+    def _get_fragments(txt: str, pattern: Pattern[str]) -> Iterable[str | Match[str]]:
         offset = 0
         for match in pattern.finditer(txt):
             match_s, match_e = match.span(1)
@@ -316,13 +312,11 @@ def get_base_url(
     """
 
     utext: str = remove_comments(text, encoding=encoding)
-    m = _baseurl_re.search(utext)
-    if m:
+    if m := _baseurl_re.search(utext):
         return urljoin(
             safe_url_string(baseurl), safe_url_string(m.group(1), encoding=encoding)
         )
-    else:
-        return safe_url_string(baseurl)
+    return safe_url_string(baseurl)
 
 
 def get_meta_refresh(
@@ -330,7 +324,7 @@
     baseurl: str = "",
     encoding: str = "utf-8",
     ignore_tags: Iterable[str] = ("script", "noscript"),
-) -> Union[Tuple[None, None], Tuple[float, str]]:
+) -> tuple[None, None] | tuple[float, str]:
     """Return the http-equiv parameter of the HTML meta element from the given
     HTML text and return a tuple ``(interval, url)`` where interval is an
     integer containing the delay in seconds (or zero if not present) and url is a
@@ -347,14 +341,12 @@
         raise
     utext = remove_tags_with_content(utext, ignore_tags)
     utext = remove_comments(replace_entities(utext))
-    m = _meta_refresh_re.search(utext) or _meta_refresh_re2.search(utext)
-    if m:
+    if m := _meta_refresh_re.search(utext) or _meta_refresh_re2.search(utext):
         interval = float(m.group("int"))
         url = safe_url_string(m.group("url").strip(" \"'"), encoding)
         url = urljoin(baseurl, url)
         return interval, url
-    else:
-        return None, None
+    return None, None
 
 
 def strip_html5_whitespace(text: str) -> str:
diff --git a/w3lib/http.py b/w3lib/http.py
index 8409d86..17609a2 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -1,20 +1,14 @@
+from __future__ import annotations
+
 from base64 import b64encode
-from typing import (
-    Any,
-    List,
-    Mapping,
-    MutableMapping,
-    Optional,
-    Sequence,
-    Union,
-    overload,
-)
+from collections.abc import Mapping, MutableMapping, Sequence
+from typing import Any, Union, overload
 
 from w3lib._types import StrOrBytes
 from w3lib.util import to_bytes, to_unicode
 
 HeadersDictInput = Mapping[bytes, Union[Any, Sequence[bytes]]]
-HeadersDictOutput = MutableMapping[bytes, List[bytes]]
+HeadersDictOutput = MutableMapping[bytes, list[bytes]]
 
 
 @overload
@@ -25,7 +19,7 @@ def headers_raw_to_dict(headers_raw: bytes) -> HeadersDictOutput: ...
 def headers_raw_to_dict(headers_raw: None) -> None: ...
 
 
-def headers_raw_to_dict(headers_raw: Optional[bytes]) -> Optional[HeadersDictOutput]:
+def headers_raw_to_dict(headers_raw: bytes | None) -> HeadersDictOutput | None:
     r"""
     Convert raw headers (single multi-line bytestring)
     to a dictionary.
@@ -78,7 +72,7 @@ def headers_dict_to_raw(headers_dict: HeadersDictInput) -> bytes: ...
 def headers_dict_to_raw(headers_dict: None) -> None: ...
 
 
-def headers_dict_to_raw(headers_dict: Optional[HeadersDictInput]) -> Optional[bytes]:
+def headers_dict_to_raw(headers_dict: HeadersDictInput | None) -> bytes | None:
     r"""
     Returns a raw HTTP headers representation of headers
 
diff --git a/w3lib/url.py b/w3lib/url.py
index c142048..da010d9 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -3,24 +3,16 @@
 library.
 """
 
+from __future__ import annotations
+
 import base64
 import codecs
 import os
 import posixpath
 import re
 import string
-from typing import (
-    Callable,
-    Dict,
-    List,
-    NamedTuple,
-    Optional,
-    Sequence,
-    Tuple,
-    Union,
-    cast,
-    overload,
-)
+from collections.abc import Sequence
+from typing import Callable, NamedTuple, cast, overload
 from urllib.parse import _coerce_args  # type: ignore
 from urllib.parse import (
     ParseResult,
@@ -45,7 +37,7 @@
 
 
 # error handling function for bytes-to-Unicode decoding errors with URLs
-def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
+def _quote_byte(error: UnicodeError) -> tuple[str, int]:
     error = cast(AnyUnicodeError, error)
     return (to_unicode(quote(error.object[error.start : error.end])), error.end)
 
@@ -227,8 +219,8 @@ def url_query_parameter(
     url: StrOrBytes,
     parameter: str,
     default: None = None,
-    keep_blank_values: Union[bool, int] = 0,
-) -> Optional[str]: ...
+    keep_blank_values: bool | int = 0,
+) -> str | None: ...
 
 
 @overload
@@ -236,16 +228,16 @@ def url_query_parameter(
     url: StrOrBytes,
     parameter: str,
     default: str,
-    keep_blank_values: Union[bool, int] = 0,
+    keep_blank_values: bool | int = 0,
 ) -> str: ...
 
 
 def url_query_parameter(
     url: StrOrBytes,
     parameter: str,
-    default: Optional[str] = None,
-    keep_blank_values: Union[bool, int] = 0,
-) -> Optional[str]:
+    default: str | None = None,
+    keep_blank_values: bool | int = 0,
+) -> str | None:
     """Return the value of a url parameter, given the url and parameter name
 
     General case:
@@ -279,13 +271,12 @@ def url_query_parameter(
     )
     if parameter in queryparams:
         return queryparams[parameter][0]
-    else:
-        return default
+    return default
 
 
 def url_query_cleaner(
     url: StrOrBytes,
-    parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (),
+    parameterlist: StrOrBytes | Sequence[StrOrBytes] = (),
     sep: str = "&",
     kvsep: str = "=",
     remove: bool = False,
@@ -337,20 +328,19 @@ def url_query_cleaner(
         k, _, _ = ksv.partition(kvsep)
         if unique and k in seen:
             continue
-        elif remove and k in parameterlist:
+        if remove and k in parameterlist:
             continue
-        elif not remove and k not in parameterlist:
+        if not remove and k not in parameterlist:
             continue
-        else:
-            querylist.append(ksv)
-            seen.add(k)
+        querylist.append(ksv)
+        seen.add(k)
     url = "?".join([base, sep.join(querylist)]) if querylist else base
     if keep_fragments and fragment:
         url += "#" + fragment
     return url
 
 
-def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
+def _add_or_replace_parameters(url: str, params: dict[str, str]) -> str:
     parsed = urlsplit(url)
     current_args = parse_qsl(parsed.query, keep_blank_values=True)
 
@@ -388,7 +378,7 @@ def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
     return _add_or_replace_parameters(url, {name: new_value})
 
 
-def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
+def add_or_replace_parameters(url: str, new_parameters: dict[str, str]) -> str:
     """Add or remove a parameters to a given url
 
     >>> import w3lib.url
@@ -433,7 +423,6 @@ def any_to_uri(uri_or_path: str) -> str:
 _char = set(map(chr, range(127)))
 
 # RFC 2045 token.
-# pylint: disable=consider-using-f-string
 _token = r"[{}]+".format(
     re.escape(
         "".join(
@@ -449,7 +438,6 @@ def any_to_uri(uri_or_path: str) -> str:
 )
 
 # RFC 822 quoted-string, without surrounding quotation marks.
-# pylint: disable=consider-using-f-string
 _quoted_string = r"(?:[{}]|(?:\\[{}]))*".format(
     re.escape("".join(_char - {'"', "\\", "\r"})), re.escape("".join(_char))
 )
@@ -473,7 +461,7 @@ class ParseDataURIResult(NamedTuple):
     #: MIME type type and subtype, separated by / (e.g. ``"text/plain"``).
     media_type: str
     #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``).
-    media_type_parameters: Dict[str, str]
+    media_type_parameters: dict[str, str]
     #: Data, decoded if it was encoded in base64 format.
     data: bytes
 
@@ -550,7 +538,7 @@ def parse_data_uri(uri: StrOrBytes) -> ParseDataURIResult:
 
 def _safe_ParseResult(
     parts: ParseResult, encoding: str = "utf8", path_encoding: str = "utf8"
-) -> Tuple[str, str, str, str, str, str]:
+) -> tuple[str, str, str, str, str, str]:
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
     try:
@@ -569,10 +557,10 @@ def _safe_ParseResult(
 
 
 def canonicalize_url(
-    url: Union[StrOrBytes, ParseResult],
+    url: StrOrBytes | ParseResult,
     keep_blank_values: bool = True,
     keep_fragments: bool = False,
-    encoding: Optional[str] = None,
+    encoding: str | None = None,
 ) -> str:
     r"""Canonicalize the given url by applying the following procedures:
 
@@ -676,7 +664,7 @@ def _unquotepath(path: str) -> bytes:
 
 
 def parse_url(
-    url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None
+    url: StrOrBytes | ParseResult, encoding: str | None = None
 ) -> ParseResult:
     """Return urlparsed url from the given argument (which could be an already
     parsed url)
@@ -688,7 +676,7 @@ def parse_url(
 
 def parse_qsl_to_bytes(
     qs: str, keep_blank_values: bool = False
-) -> List[Tuple[bytes, bytes]]:
+) -> list[tuple[bytes, bytes]]:
     """Parse a query given as a string argument.
 
     Data are returned as a list of name, value pairs as bytes.
@@ -708,7 +696,7 @@ def parse_qsl_to_bytes(
     # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
     # except for the unquote(s, encoding, errors) calls replaced
     # with unquote_to_bytes(s)
-    coerce_args = cast(Callable[..., Tuple[str, Callable[..., bytes]]], _coerce_args)
+    coerce_args = cast(Callable[..., tuple[str, Callable[..., bytes]]], _coerce_args)
     qs, _coerce_result = coerce_args(qs)
     pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")]
     r = []
diff --git a/w3lib/util.py b/w3lib/util.py
index 8fc1c62..2ca8f33 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -1,10 +1,10 @@
-from typing import Optional
+from __future__ import annotations
 
 from w3lib._types import StrOrBytes
 
 
 def to_unicode(
-    text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
+    text: StrOrBytes, encoding: str | None = None, errors: str = "strict"
 ) -> str:
     """Return the unicode representation of a bytes object `text`. If
     `text` is already an unicode object, return it as-is."""
@@ -20,7 +20,7 @@ def to_unicode(
 
 
 def to_bytes(
-    text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
+    text: StrOrBytes, encoding: str | None = None, errors: str = "strict"
 ) -> bytes:
     """Return the binary representation of `text`. If `text`
     is already a bytes object, return it as-is."""