diff --git a/.github/banner.svg b/.github/banner.svg index 35dc93eaea42..ea7f9e306e9a 100644 --- a/.github/banner.svg +++ b/.github/banner.svg @@ -1,4 +1,4 @@ - + - - - - + + + + diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dcbb8c501afe..5285923e71d3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -125,11 +125,12 @@ jobs: sudo apt -y install zip pandoc man sed cat > ./requirements.txt << EOF python=3.10.* + pyinstaller brotli-python EOF python devscripts/install_deps.py --print \ --exclude brotli --exclude brotlicffi \ - --include secretstorage --include pyinstaller >> ./requirements.txt + --include secretstorage >> ./requirements.txt mamba create -n build --file ./requirements.txt - name: Prepare @@ -250,6 +251,22 @@ jobs: python3 devscripts/install_deps.py --print --include pyinstaller > requirements.txt # We need to ignore wheels otherwise we break universal2 builds python3 -m pip install -U --user --no-binary :all: -r requirements.txt + # We need to fuse our own universal2 wheels for curl_cffi + python3 -m pip install -U --user delocate + mkdir curl_cffi_whls curl_cffi_universal2 + python3 devscripts/install_deps.py --print -o --include curl_cffi > requirements.txt + for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do + python3 -m pip download \ + --only-binary=:all: \ + --platform "${platform}" \ + --pre -d curl_cffi_whls \ + -r requirements.txt + done + python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/curl_cffi*.whl -w curl_cffi_universal2 + python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/cffi*.whl -w curl_cffi_universal2 + cd curl_cffi_universal2 + for wheel in *cffi*.whl; do mv -n -- "${wheel}" "${wheel/x86_64/universal2}"; done + python3 -m pip install -U --user *cffi*.whl - name: Prepare run: | @@ -303,7 +320,7 @@ jobs: run: | brew install coreutils python3 devscripts/install_deps.py --user -o --include build - python3 devscripts/install_deps.py --user --include pyinstaller + 
python3 devscripts/install_deps.py --user --include pyinstaller --include curl_cffi - name: Prepare run: | @@ -345,7 +362,7 @@ jobs: - name: Install Requirements run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build - python devscripts/install_deps.py --include py2exe + python devscripts/install_deps.py --include py2exe --include curl_cffi python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.8.0-py3-none-any.whl" - name: Prepare @@ -450,8 +467,9 @@ jobs: - name: Make SHA2-SUMS files run: | cd ./artifact/ - sha256sum * > ../SHA2-256SUMS - sha512sum * > ../SHA2-512SUMS + # make sure SHA sums are also printed to stdout + sha256sum * | tee ../SHA2-256SUMS + sha512sum * | tee ../SHA2-512SUMS - name: Make Update spec run: | diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index ba8630630c5b..076f785bf0de 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -53,7 +53,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install test requirements - run: python3 ./devscripts/install_deps.py --include dev + run: python3 ./devscripts/install_deps.py --include dev --include curl_cffi - name: Run tests continue-on-error: False run: | diff --git a/README.md b/README.md index 1e108a29c225..d4dd2c7be591 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ -yt-dlp is a [youtube-dl](https://github.com/ytdl-org/youtube-dl) fork based on the now inactive [youtube-dlc](https://github.com/blackjack4494/yt-dlc). The main focus of this project is adding new features and patches while also keeping up to date with the original project +yt-dlp is a feature-rich command-line audio/video downloader with support for [thousands of sites](supportedsites.md). The project is a fork of [youtube-dl](https://github.com/ytdl-org/youtube-dl) based on the now inactive [youtube-dlc](https://github.com/blackjack4494/yt-dlc). 
@@ -196,6 +196,15 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly * [**websockets**](https://github.com/aaugustin/websockets)\* - For downloading over websocket. Licensed under [BSD-3-Clause](https://github.com/aaugustin/websockets/blob/main/LICENSE) * [**requests**](https://github.com/psf/requests)\* - HTTP library. For HTTPS proxy and persistent connections support. Licensed under [Apache-2.0](https://github.com/psf/requests/blob/main/LICENSE) +#### Impersonation + +The following provide support for impersonating browser requests. This may be required for some sites that employ TLS fingerprinting. + +* [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE) + * Can be installed with the `curl_cffi` group, e.g. `pip install yt-dlp[default,curl_cffi]` + * Only included in `yt-dlp.exe`, `yt-dlp_macos` and `yt-dlp_macos_legacy` builds + + ### Metadata * [**mutagen**](https://github.com/quodlibet/mutagen)\* - For `--embed-thumbnail` in certain formats. Licensed under [GPLv2+](https://github.com/quodlibet/mutagen/blob/master/COPYING) @@ -389,6 +398,10 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds --source-address IP Client-side IP address to bind to + --impersonate CLIENT[:OS] Client to impersonate for requests. E.g. + chrome, chrome-110, chrome:windows-10. Pass + --impersonate="" to impersonate any client. + --list-impersonate-targets List available clients to impersonate. -4, --force-ipv4 Make all connections via IPv4 -6, --force-ipv6 Make all connections via IPv6 --enable-file-urls Enable file:// URLs. 
This is disabled by @@ -468,6 +481,9 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --max-downloads NUMBER Abort after downloading NUMBER files --break-on-existing Stop the download process when encountering a file that is in the archive + --no-break-on-existing Do not stop the download process when + encountering a file that is in the archive + (default) --break-per-input Alters --max-downloads, --break-on-existing, --break-match-filter, and autonumber to reset per input URL @@ -1459,9 +1475,9 @@ The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, ` - `width`: Width of the video, if known - `height`: Height of the video, if known - `aspect_ratio`: Aspect ratio of the video, if known - - `tbr`: Average bitrate of audio and video in KBit/s - - `abr`: Average audio bitrate in KBit/s - - `vbr`: Average video bitrate in KBit/s + - `tbr`: Average bitrate of audio and video in [kbps](## "1000 bits/sec") + - `abr`: Average audio bitrate in [kbps](## "1000 bits/sec") + - `vbr`: Average video bitrate in [kbps](## "1000 bits/sec") - `asr`: Audio sampling rate in Hertz - `fps`: Frame rate - `audio_channels`: The number of audio channels @@ -1486,7 +1502,7 @@ Any string comparison may be prefixed with negation `!` in order to produce an o **Note**: None of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the website. Any other field made available by the extractor can also be used for filtering. -Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "bv[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. You can also use the filters with `all` to download all formats that satisfy the filter, e.g. 
`-f "all[vcodec=none]"` selects all audio-only formats. +Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "bv[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 kbps. You can also use the filters with `all` to download all formats that satisfy the filter, e.g. `-f "all[vcodec=none]"` selects all audio-only formats. Format selectors can also be grouped using parentheses; e.g. `-f "(mp4,webm)[height<480]"` will download the best pre-merged mp4 and webm formats with a height lower than 480. @@ -1518,10 +1534,10 @@ The available fields are: - `fps`: Framerate of video - `hdr`: The dynamic range of the video (`DV` > `HDR12` > `HDR10+` > `HDR10` > `HLG` > `SDR`) - `channels`: The number of audio channels - - `tbr`: Total average bitrate in KBit/s - - `vbr`: Average video bitrate in KBit/s - - `abr`: Average audio bitrate in KBit/s - - `br`: Average bitrate in KBit/s, `tbr`/`vbr`/`abr` + - `tbr`: Total average bitrate in [kbps](## "1000 bits/sec") + - `vbr`: Average video bitrate in [kbps](## "1000 bits/sec") + - `abr`: Average audio bitrate in [kbps](## "1000 bits/sec") + - `br`: Average bitrate in [kbps](## "1000 bits/sec"), `tbr`/`vbr`/`abr` - `asr`: Audio sample rate in Hz **Deprecation warning**: Many of these fields have (currently undocumented) aliases, that may be removed in a future version. It is recommended to use only the documented field names. @@ -1792,9 +1808,12 @@ The following extractors use this feature: * `max_comments`: Maximum number of comments to extract - default is `120` #### tiktok -* `api_hostname`: Hostname to use for mobile API requests, e.g. `api-h2.tiktokv.com` -* `app_version`: App version to call mobile APIs with - should be set along with `manifest_app_version`, e.g. `20.2.1` -* `manifest_app_version`: Numeric app version to call mobile APIs with, e.g. 
`221` +* `api_hostname`: Hostname to use for mobile API calls, e.g. `api22-normal-c-alisg.tiktokv.com` +* `app_name`: Default app name to use with mobile API calls, e.g. `trill` +* `app_version`: Default app version to use with mobile API calls - should be set along with `manifest_app_version`, e.g. `34.1.2` +* `manifest_app_version`: Default numeric app version to use with mobile API calls, e.g. `2023401020` +* `aid`: Default app ID to use with API calls, e.g. `1180` +* `app_info`: One or more app info strings in the format of `<iid>/[app_name]/[app_version]/[manifest_app_version]/[aid]`, where `iid` is the unique app install ID. `iid` is the only required value; all other values and their `/` separators can be omitted, e.g. `tiktok:app_info=1234567890123456789` or `tiktok:app_info=123,456/trill///1180,789//34.0.1/340001` #### rokfinchannel * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` diff --git a/bundle/py2exe.py b/bundle/py2exe.py index ccb52eaa20be..2811674925b0 100755 --- a/bundle/py2exe.py +++ b/bundle/py2exe.py @@ -28,7 +28,7 @@ def main(): }], version_info={ 'version': VERSION, - 'description': 'A youtube-dl fork with additional features and patches', + 'description': 'A feature-rich command-line audio/video downloader', 'comments': 'Official repository: ', 'product_name': 'yt-dlp', 'product_version': VERSION, diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 2a34ad0712d1..eaa348cf2e02 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -126,5 +126,9 @@ "when": "4ce57d3b873c2887814cbec03d029533e82f7db5", "short": "[ie] Support multi-period MPD streams (#6654)", "authors": ["alard", "pukkandan"] + }, + { + "action": "remove", + "when": "22e4dfacb61f62dfbb3eb41b31c7b69ba1059b80" } ] diff --git a/devscripts/install_deps.py b/devscripts/install_deps.py index 889d9abeb716..d33fc637c6fa 100755 --- a/devscripts/install_deps.py +++ 
b/devscripts/install_deps.py @@ -10,6 +10,8 @@ import re import subprocess +from pathlib import Path + from devscripts.tomlparse import parse_toml from devscripts.utils import read_file @@ -17,17 +19,23 @@ def parse_args(): parser = argparse.ArgumentParser(description='Install dependencies for yt-dlp') parser.add_argument( - 'input', nargs='?', metavar='TOMLFILE', default='pyproject.toml', help='Input file (default: %(default)s)') + 'input', nargs='?', metavar='TOMLFILE', default=Path(__file__).parent.parent / 'pyproject.toml', + help='input file (default: %(default)s)') parser.add_argument( - '-e', '--exclude', metavar='DEPENDENCY', action='append', help='Exclude a dependency') + '-e', '--exclude', metavar='DEPENDENCY', action='append', + help='exclude a dependency') parser.add_argument( - '-i', '--include', metavar='GROUP', action='append', help='Include an optional dependency group') + '-i', '--include', metavar='GROUP', action='append', + help='include an optional dependency group') parser.add_argument( - '-o', '--only-optional', action='store_true', help='Only install optional dependencies') + '-o', '--only-optional', action='store_true', + help='only install optional dependencies') parser.add_argument( - '-p', '--print', action='store_true', help='Only print a requirements.txt to stdout') + '-p', '--print', action='store_true', + help='only print requirements to stdout') parser.add_argument( - '-u', '--user', action='store_true', help='Install with pip as --user') + '-u', '--user', action='store_true', + help='install with pip as --user') return parser.parse_args() @@ -37,24 +45,16 @@ def main(): optional_groups = project_table['optional-dependencies'] excludes = args.exclude or [] - deps = [] + targets = [] if not args.only_optional: # `-o` should exclude 'dependencies' and the 'default' group - deps.extend(project_table['dependencies']) + targets.extend(project_table['dependencies']) if 'default' not in excludes: # `--exclude default` should exclude entire 
'default' group - deps.extend(optional_groups['default']) - - def name(dependency): - return re.match(r'[\w-]+', dependency)[0].lower() - - target_map = {name(dep): dep for dep in deps} + targets.extend(optional_groups['default']) for include in filter(None, map(optional_groups.get, args.include or [])): - target_map.update(zip(map(name, include), include)) - - for exclude in map(name, excludes): - target_map.pop(exclude, None) + targets.extend(include) - targets = list(target_map.values()) + targets = [t for t in targets if re.match(r'[\w-]+', t).group(0).lower() not in excludes] if args.print: for target in targets: diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index 9b12e71e5f9d..009e7bba101a 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -24,7 +24,7 @@ # NAME -yt\-dlp \- A youtube-dl fork with additional features and patches +yt\-dlp \- A feature\-rich command\-line audio/video downloader # SYNOPSIS diff --git a/devscripts/tomlparse.py b/devscripts/tomlparse.py index 85ac4eef7893..ac9ea3170738 100755 --- a/devscripts/tomlparse.py +++ b/devscripts/tomlparse.py @@ -11,7 +11,7 @@ from __future__ import annotations -import datetime +import datetime as dt import json import re @@ -115,9 +115,9 @@ def parse_value(data: str, index: int): for func in [ int, float, - datetime.time.fromisoformat, - datetime.date.fromisoformat, - datetime.datetime.fromisoformat, + dt.time.fromisoformat, + dt.date.fromisoformat, + dt.datetime.fromisoformat, {'true': True, 'false': False}.get, ]: try: @@ -179,7 +179,7 @@ def main(): data = file.read() def default(obj): - if isinstance(obj, (datetime.date, datetime.time, datetime.datetime)): + if isinstance(obj, (dt.date, dt.time, dt.datetime)): return obj.isoformat() print(json.dumps(parse_toml(data), default=default)) diff --git a/devscripts/update-version.py b/devscripts/update-version.py index da54a6a25882..07a071745866 100644 --- a/devscripts/update-version.py +++ 
b/devscripts/update-version.py @@ -9,15 +9,15 @@ import argparse import contextlib +import datetime as dt import sys -from datetime import datetime, timezone from devscripts.utils import read_version, run_process, write_file def get_new_version(version, revision): if not version: - version = datetime.now(timezone.utc).strftime('%Y.%m.%d') + version = dt.datetime.now(dt.timezone.utc).strftime('%Y.%m.%d') if revision: assert revision.isdecimal(), 'Revision must be a number' diff --git a/pyproject.toml b/pyproject.toml index 64504ff98665..9faf53b9c839 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ maintainers = [ {name = "bashonly", email = "bashonly@protonmail.com"}, {name = "coletdjnz", email = "coletdjnz@protonmail.com"}, ] -description = "A youtube-dl fork with additional features and patches" +description = "A feature-rich command-line audio/video downloader" readme = "README.md" requires-python = ">=3.8" keywords = [ @@ -53,6 +53,7 @@ dependencies = [ [project.optional-dependencies] default = [] +curl_cffi = ["curl-cffi==0.5.10; implementation_name=='cpython'"] secretstorage = [ "cffi", "secretstorage", @@ -68,7 +69,10 @@ dev = [ "isort", "pytest", ] -pyinstaller = ["pyinstaller>=6.3"] +pyinstaller = [ + "pyinstaller>=6.3; sys_platform!='darwin'", + "pyinstaller==5.13.2; sys_platform=='darwin'", # needed for curl_cffi +] py2exe = ["py2exe>=0.12"] [project.urls] diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 6be47af97f7c..5242cf88f92c 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -183,7 +183,7 @@ def test_format_selection_audio_exts(self): ] info_dict = _make_result(formats) - ydl = YDL({'format': 'best'}) + ydl = YDL({'format': 'best', 'format_sort': ['abr', 'ext']}) ydl.sort_formats(info_dict) ydl.process_ie_result(copy.deepcopy(info_dict)) downloaded = ydl.downloaded_info_dicts[0] @@ -195,7 +195,7 @@ def test_format_selection_audio_exts(self): downloaded = ydl.downloaded_info_dicts[0] 
self.assertEqual(downloaded['format_id'], 'mp3-64') - ydl = YDL({'prefer_free_formats': True}) + ydl = YDL({'prefer_free_formats': True, 'format_sort': ['abr', 'ext']}) ydl.sort_formats(info_dict) ydl.process_ie_result(copy.deepcopy(info_dict)) downloaded = ydl.downloaded_info_dicts[0] diff --git a/test/test_cookies.py b/test/test_cookies.py index 5282ef6215d0..bd61f30a660d 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -1,5 +1,5 @@ +import datetime as dt import unittest -from datetime import datetime, timezone from yt_dlp import cookies from yt_dlp.cookies import ( @@ -138,7 +138,7 @@ def test_safari_cookie_parsing(self): self.assertEqual(cookie.name, 'foo') self.assertEqual(cookie.value, 'test%20%3Bcookie') self.assertFalse(cookie.secure) - expected_expiration = datetime(2021, 6, 18, 21, 39, 19, tzinfo=timezone.utc) + expected_expiration = dt.datetime(2021, 6, 18, 21, 39, 19, tzinfo=dt.timezone.utc) self.assertEqual(cookie.expires, int(expected_expiration.timestamp())) def test_pbkdf2_sha1(self): diff --git a/test/test_networking.py b/test/test_networking.py index 628f1f171111..b50f70d086a6 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -27,9 +27,10 @@ from email.message import Message from http.cookiejar import CookieJar +from test.conftest import validate_and_send from test.helper import FakeYDL, http_server_port, verify_address_availability from yt_dlp.cookies import YoutubeDLCookieJar -from yt_dlp.dependencies import brotli, requests, urllib3 +from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3 from yt_dlp.networking import ( HEADRequest, PUTRequest, @@ -50,10 +51,13 @@ TransportError, UnsupportedRequest, ) +from yt_dlp.networking.impersonate import ( + ImpersonateRequestHandler, + ImpersonateTarget, +) +from yt_dlp.utils import YoutubeDLError from yt_dlp.utils._utils import _YDLLogger as FakeLogger -from yt_dlp.utils.networking import HTTPHeaderDict - -from test.conftest import validate_and_send +from 
yt_dlp.utils.networking import HTTPHeaderDict, std_headers TEST_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -75,6 +79,7 @@ def do_GET(self): class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): protocol_version = 'HTTP/1.1' + default_request_version = 'HTTP/1.1' def log_message(self, format, *args): pass @@ -112,6 +117,8 @@ def _status(self, status): def _read_data(self): if 'Content-Length' in self.headers: return self.rfile.read(int(self.headers['Content-Length'])) + else: + return b'' def do_POST(self): data = self._read_data() + str(self.headers).encode() @@ -195,7 +202,8 @@ def do_GET(self): self._headers() elif self.path.startswith('/308-to-headers'): self.send_response(308) - self.send_header('Location', '/headers') + # redirect to "localhost" for testing cookie redirection handling + self.send_header('Location', f'http://localhost:{self.connection.getsockname()[1]}/headers') self.send_header('Content-Length', '0') self.end_headers() elif self.path == '/trailing_garbage': @@ -310,7 +318,7 @@ def setup_class(cls): class TestHTTPRequestHandler(TestRequestHandlerBase): - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_verify_cert(self, handler): with handler() as rh: with pytest.raises(CertificateVerifyError): @@ -321,7 +329,7 @@ def test_verify_cert(self, handler): assert r.status == 200 r.close() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_ssl_error(self, handler): # HTTPS server with too old TLS version # XXX: is there a better way to test this than to create a new server? 
@@ -335,11 +343,11 @@ def test_ssl_error(self, handler): https_server_thread.start() with handler(verify=False) as rh: - with pytest.raises(SSLError, match=r'ssl(?:v3|/tls) alert handshake failure') as exc_info: + with pytest.raises(SSLError, match=r'(?i)ssl(?:v3|/tls).alert.handshake.failure') as exc_info: validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) assert not issubclass(exc_info.type, CertificateVerifyError) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_percent_encode(self, handler): with handler() as rh: # Unicode characters should be encoded with uppercase percent-encoding @@ -351,7 +359,7 @@ def test_percent_encode(self, handler): assert res.status == 200 res.close() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) @pytest.mark.parametrize('path', [ '/a/b/./../../headers', '/redirect_dotsegments', @@ -367,6 +375,7 @@ def test_remove_dot_segments(self, handler, path): assert res.url == f'http://127.0.0.1:{self.http_port}/headers' res.close() + # Not supported by CurlCFFI (non-standard) @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_unicode_path_redirection(self, handler): with handler() as rh: @@ -374,7 +383,7 @@ def test_unicode_path_redirection(self, handler): assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html' r.close() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_raise_http_error(self, handler): with handler() as rh: for bad_status in (400, 500, 599, 302): @@ -384,7 +393,7 @@ def test_raise_http_error(self, handler): # Should not raise an error validate_and_send(rh, 
Request('http://127.0.0.1:%d/gen_200' % self.http_port)).close() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_response_url(self, handler): with handler() as rh: # Response url should be that of the last url in redirect chain @@ -395,62 +404,50 @@ def test_response_url(self, handler): assert res2.url == f'http://127.0.0.1:{self.http_port}/gen_200' res2.close() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) - def test_redirect(self, handler): + # Covers some basic cases we expect some level of consistency between request handlers for + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) + @pytest.mark.parametrize('redirect_status,method,expected', [ + # A 303 must either use GET or HEAD for subsequent request + (303, 'POST', ('', 'GET', False)), + (303, 'HEAD', ('', 'HEAD', False)), + + # 301 and 302 turn POST only into a GET + (301, 'POST', ('', 'GET', False)), + (301, 'HEAD', ('', 'HEAD', False)), + (302, 'POST', ('', 'GET', False)), + (302, 'HEAD', ('', 'HEAD', False)), + + # 307 and 308 should not change method + (307, 'POST', ('testdata', 'POST', True)), + (308, 'POST', ('testdata', 'POST', True)), + (307, 'HEAD', ('', 'HEAD', False)), + (308, 'HEAD', ('', 'HEAD', False)), + ]) + def test_redirect(self, handler, redirect_status, method, expected): with handler() as rh: - def do_req(redirect_status, method, assert_no_content=False): - data = b'testdata' if method in ('POST', 'PUT') else None - res = validate_and_send( - rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_{redirect_status}', method=method, data=data)) - - headers = b'' - data_sent = b'' - if data is not None: - data_sent += res.read(len(data)) - if data_sent != data: - headers += data_sent - data_sent = b'' - - headers += res.read() - - if assert_no_content or data is None: - assert b'Content-Type' 
not in headers - assert b'Content-Length' not in headers - else: - assert b'Content-Type' in headers - assert b'Content-Length' in headers - - return data_sent.decode(), res.headers.get('method', '') - - # A 303 must either use GET or HEAD for subsequent request - assert do_req(303, 'POST', True) == ('', 'GET') - assert do_req(303, 'HEAD') == ('', 'HEAD') - - assert do_req(303, 'PUT', True) == ('', 'GET') - - # 301 and 302 turn POST only into a GET - assert do_req(301, 'POST', True) == ('', 'GET') - assert do_req(301, 'HEAD') == ('', 'HEAD') - assert do_req(302, 'POST', True) == ('', 'GET') - assert do_req(302, 'HEAD') == ('', 'HEAD') - - assert do_req(301, 'PUT') == ('testdata', 'PUT') - assert do_req(302, 'PUT') == ('testdata', 'PUT') + data = b'testdata' if method == 'POST' else None + headers = {} + if data is not None: + headers['Content-Type'] = 'application/test' + res = validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_{redirect_status}', method=method, data=data, + headers=headers)) - # 307 and 308 should not change method - for m in ('POST', 'PUT'): - assert do_req(307, m) == ('testdata', m) - assert do_req(308, m) == ('testdata', m) + headers = b'' + data_recv = b'' + if data is not None: + data_recv += res.read(len(data)) + if data_recv != data: + headers += data_recv + data_recv = b'' - assert do_req(307, 'HEAD') == ('', 'HEAD') - assert do_req(308, 'HEAD') == ('', 'HEAD') + headers += res.read() - # These should not redirect and instead raise an HTTPError - for code in (300, 304, 305, 306): - with pytest.raises(HTTPError): - do_req(code, 'GET') + assert expected[0] == data_recv.decode() + assert expected[1] == res.headers.get('method') + assert expected[2] == ('content-length' in headers.decode().lower()) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_request_cookie_header(self, handler): # We 
should accept a Cookie header being passed as in normal headers and handle it appropriately. with handler() as rh: @@ -459,16 +456,17 @@ def test_request_cookie_header(self, handler): rh, Request( f'http://127.0.0.1:{self.http_port}/headers', headers={'Cookie': 'test=test'})).read().decode() - assert 'Cookie: test=test' in res + assert 'cookie: test=test' in res.lower() # Specified Cookie header should be removed on any redirect res = validate_and_send( rh, Request( f'http://127.0.0.1:{self.http_port}/308-to-headers', - headers={'Cookie': 'test=test'})).read().decode() - assert 'Cookie: test=test' not in res + headers={'Cookie': 'test=test2'})).read().decode() + assert 'cookie: test=test2' not in res.lower() # Specified Cookie header should override global cookiejar for that request + # Whether cookies from the cookiejar is applied on the redirect is considered undefined for now cookiejar = YoutubeDLCookieJar() cookiejar.set_cookie(http.cookiejar.Cookie( version=0, name='test', value='ytdlp', port=None, port_specified=False, @@ -478,23 +476,23 @@ def test_request_cookie_header(self, handler): with handler(cookiejar=cookiejar) as rh: data = validate_and_send( - rh, Request(f'http://127.0.0.1:{self.http_port}/headers', headers={'cookie': 'test=test'})).read() - assert b'Cookie: test=ytdlp' not in data - assert b'Cookie: test=test' in data + rh, Request(f'http://127.0.0.1:{self.http_port}/headers', headers={'cookie': 'test=test3'})).read() + assert b'cookie: test=ytdlp' not in data.lower() + assert b'cookie: test=test3' in data.lower() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_redirect_loop(self, handler): with handler() as rh: with pytest.raises(HTTPError, match='redirect loop'): validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop')) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) 
+ @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_incompleteread(self, handler): with handler(timeout=2) as rh: - with pytest.raises(IncompleteRead): + with pytest.raises(IncompleteRead, match='13 bytes read, 234221 more expected'): validate_and_send(rh, Request('http://127.0.0.1:%d/incompleteread' % self.http_port)).read() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_cookies(self, handler): cookiejar = YoutubeDLCookieJar() cookiejar.set_cookie(http.cookiejar.Cookie( @@ -503,47 +501,66 @@ def test_cookies(self, handler): with handler(cookiejar=cookiejar) as rh: data = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers')).read() - assert b'Cookie: test=ytdlp' in data + assert b'cookie: test=ytdlp' in data.lower() # Per request with handler() as rh: data = validate_and_send( rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read() - assert b'Cookie: test=ytdlp' in data + assert b'cookie: test=ytdlp' in data.lower() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_headers(self, handler): with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: # Global Headers - data = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers')).read() - assert b'Test1: test' in data + data = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers')).read().lower() + assert b'test1: test' in data # Per request headers, merged with global data = validate_and_send(rh, Request( - f'http://127.0.0.1:{self.http_port}/headers', headers={'test2': 'changed', 'test3': 'test3'})).read() - assert b'Test1: test' in data - assert b'Test2: changed' in data - assert b'Test2: 
test2' not in data - assert b'Test3: test3' in data - - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) - def test_timeout(self, handler): + f'http://127.0.0.1:{self.http_port}/headers', headers={'test2': 'changed', 'test3': 'test3'})).read().lower() + assert b'test1: test' in data + assert b'test2: changed' in data + assert b'test2: test2' not in data + assert b'test3: test3' in data + + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) + def test_read_timeout(self, handler): with handler() as rh: # Default timeout is 20 seconds, so this should go through validate_and_send( - rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_3')) + rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1')) - with handler(timeout=0.5) as rh: + with handler(timeout=0.1) as rh: with pytest.raises(TransportError): validate_and_send( - rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1')) + rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_5')) # Per request timeout, should override handler timeout validate_and_send( rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4})) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) + def test_connect_timeout(self, handler): + # nothing should be listening on this port + connect_timeout_url = 'http://10.255.255.255' + with handler(timeout=0.01) as rh: + now = time.time() + with pytest.raises(TransportError): + validate_and_send( + rh, Request(connect_timeout_url)) + assert 0.01 <= time.time() - now < 20 + + with handler() as rh: + with pytest.raises(TransportError): + # Per request timeout, should override handler timeout + now = time.time() + validate_and_send( + rh, Request(connect_timeout_url, extensions={'timeout': 0.01})) + assert 0.01 <= time.time() - now < 20 + + @pytest.mark.parametrize('handler', ['Urllib', 
'Requests', 'CurlCFFI'], indirect=True) def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' # on some systems these loopback addresses we need for testing may not be available @@ -554,6 +571,7 @@ def test_source_address(self, handler): rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode() assert source_address == data + # Not supported by CurlCFFI @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_gzip_trailing_garbage(self, handler): with handler() as rh: @@ -571,7 +589,7 @@ def test_brotli(self, handler): assert res.headers.get('Content-Encoding') == 'br' assert res.read() == b'' - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_deflate(self, handler): with handler() as rh: res = validate_and_send( @@ -581,7 +599,7 @@ def test_deflate(self, handler): assert res.headers.get('Content-Encoding') == 'deflate' assert res.read() == b'' - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_gzip(self, handler): with handler() as rh: res = validate_and_send( @@ -591,7 +609,7 @@ def test_gzip(self, handler): assert res.headers.get('Content-Encoding') == 'gzip' assert res.read() == b'' - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_multiple_encodings(self, handler): with handler() as rh: for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'): @@ -602,17 +620,18 @@ def test_multiple_encodings(self, handler): assert res.headers.get('Content-Encoding') == pair assert res.read() == b'' + # Not supported by curl_cffi @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], 
indirect=True) def test_unsupported_encoding(self, handler): with handler() as rh: res = validate_and_send( rh, Request( f'http://127.0.0.1:{self.http_port}/content-encoding', - headers={'ytdl-encoding': 'unsupported'})) + headers={'ytdl-encoding': 'unsupported', 'Accept-Encoding': '*'})) assert res.headers.get('Content-Encoding') == 'unsupported' assert res.read() == b'raw' - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_read(self, handler): with handler() as rh: res = validate_and_send( @@ -620,9 +639,12 @@ def test_read(self, handler): assert res.readable() assert res.read(1) == b'H' assert res.read(3) == b'ost' + assert res.read().decode().endswith('\n\n') + assert res.read() == b'' class TestHTTPProxy(TestRequestHandlerBase): + # Note: this only tests http urls over non-CONNECT proxy @classmethod def setup_class(cls): super().setup_class() @@ -642,7 +664,7 @@ def setup_class(cls): cls.geo_proxy_thread.daemon = True cls.geo_proxy_thread.start() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_http_proxy(self, handler): http_proxy = f'http://127.0.0.1:{self.proxy_port}' geo_proxy = f'http://127.0.0.1:{self.geo_port}' @@ -668,7 +690,7 @@ def test_http_proxy(self, handler): assert res != f'normal: {real_url}' assert 'Accept' in res - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_noproxy(self, handler): with handler(proxies={'proxy': f'http://127.0.0.1:{self.proxy_port}'}) as rh: # NO_PROXY @@ -678,7 +700,7 @@ def test_noproxy(self, handler): 'utf-8') assert 'Accept' in nop_response - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + 
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_allproxy(self, handler): url = 'http://foo.com/bar' with handler() as rh: @@ -686,7 +708,7 @@ def test_allproxy(self, handler): 'utf-8') assert response == f'normal: {url}' - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_http_proxy_with_idn(self, handler): with handler(proxies={ 'http': f'http://127.0.0.1:{self.proxy_port}', @@ -698,7 +720,6 @@ def test_http_proxy_with_idn(self, handler): class TestClientCertificate: - @classmethod def setup_class(cls): certfn = os.path.join(TEST_DIR, 'testcert.pem') @@ -724,27 +745,27 @@ def _run_test(self, handler, **handler_kwargs): ) as rh: validate_and_send(rh, Request(f'https://127.0.0.1:{self.port}/video.html')).read().decode() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_certificate_combined_nopass(self, handler): self._run_test(handler, client_cert={ 'client_certificate': os.path.join(self.certdir, 'clientwithkey.crt'), }) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_certificate_nocombined_nopass(self, handler): self._run_test(handler, client_cert={ 'client_certificate': os.path.join(self.certdir, 'client.crt'), 'client_certificate_key': os.path.join(self.certdir, 'client.key'), }) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_certificate_combined_pass(self, handler): self._run_test(handler, client_cert={ 'client_certificate': os.path.join(self.certdir, 'clientwithencryptedkey.crt'), 'client_certificate_password': 
'foobar', }) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_certificate_nocombined_pass(self, handler): self._run_test(handler, client_cert={ 'client_certificate': os.path.join(self.certdir, 'client.crt'), @@ -753,6 +774,18 @@ def test_certificate_nocombined_pass(self, handler): }) +@pytest.mark.parametrize('handler', ['CurlCFFI'], indirect=True) +class TestHTTPImpersonateRequestHandler(TestRequestHandlerBase): + def test_supported_impersonate_targets(self, handler): + with handler(headers=std_headers) as rh: + # note: this assumes the impersonate request handler supports the impersonate extension + for target in rh.supported_targets: + res = validate_and_send(rh, Request( + f'http://127.0.0.1:{self.http_port}/headers', extensions={'impersonate': target})) + assert res.status == 200 + assert std_headers['user-agent'].lower() not in res.read().decode().lower() + + class TestRequestHandlerMisc: """Misc generic tests for request handlers, not related to request or validation testing""" @pytest.mark.parametrize('handler,logger_name', [ @@ -931,6 +964,172 @@ def mock_close(*args, **kwargs): assert called +@pytest.mark.parametrize('handler', ['CurlCFFI'], indirect=True) +class TestCurlCFFIRequestHandler(TestRequestHandlerBase): + + @pytest.mark.parametrize('params,extensions', [ + ({}, {'impersonate': ImpersonateTarget('chrome')}), + ({'impersonate': ImpersonateTarget('chrome', '110')}, {}), + ({'impersonate': ImpersonateTarget('chrome', '99')}, {'impersonate': ImpersonateTarget('chrome', '110')}), + ]) + def test_impersonate(self, handler, params, extensions): + with handler(headers=std_headers, **params) as rh: + res = validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions=extensions)).read().decode() + assert 'sec-ch-ua: "Chromium";v="110"' in res + # Check that user agent is added over ours + assert 
'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36' in res + + def test_headers(self, handler): + with handler(headers=std_headers) as rh: + # Ensure curl-impersonate overrides our standard headers (usually added + res = validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={ + 'impersonate': ImpersonateTarget('safari')}, headers={'x-custom': 'test', 'sec-fetch-mode': 'custom'})).read().decode().lower() + + assert std_headers['user-agent'].lower() not in res + assert std_headers['accept-language'].lower() not in res + assert std_headers['sec-fetch-mode'].lower() not in res + # other than UA, custom headers that differ from std_headers should be kept + assert 'sec-fetch-mode: custom' in res + assert 'x-custom: test' in res + # but when not impersonating don't remove std_headers + res = validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/headers', headers={'x-custom': 'test'})).read().decode().lower() + # std_headers should be present + for k, v in std_headers.items(): + assert f'{k}: {v}'.lower() in res + + @pytest.mark.parametrize('raised,expected,match', [ + (lambda: curl_cffi.requests.errors.RequestsError( + '', code=curl_cffi.const.CurlECode.PARTIAL_FILE), IncompleteRead, None), + (lambda: curl_cffi.requests.errors.RequestsError( + '', code=curl_cffi.const.CurlECode.OPERATION_TIMEDOUT), TransportError, None), + (lambda: curl_cffi.requests.errors.RequestsError( + '', code=curl_cffi.const.CurlECode.RECV_ERROR), TransportError, None), + ]) + def test_response_error_mapping(self, handler, monkeypatch, raised, expected, match): + import curl_cffi.requests + + from yt_dlp.networking._curlcffi import CurlCFFIResponseAdapter + curl_res = curl_cffi.requests.Response() + res = CurlCFFIResponseAdapter(curl_res) + + def mock_read(*args, **kwargs): + try: + raise raised() + except Exception as e: + e.response = curl_res + raise + 
monkeypatch.setattr(res.fp, 'read', mock_read) + + with pytest.raises(expected, match=match) as exc_info: + res.read() + + assert exc_info.type is expected + + @pytest.mark.parametrize('raised,expected,match', [ + (lambda: curl_cffi.requests.errors.RequestsError( + '', code=curl_cffi.const.CurlECode.OPERATION_TIMEDOUT), TransportError, None), + (lambda: curl_cffi.requests.errors.RequestsError( + '', code=curl_cffi.const.CurlECode.PEER_FAILED_VERIFICATION), CertificateVerifyError, None), + (lambda: curl_cffi.requests.errors.RequestsError( + '', code=curl_cffi.const.CurlECode.SSL_CONNECT_ERROR), SSLError, None), + (lambda: curl_cffi.requests.errors.RequestsError( + '', code=curl_cffi.const.CurlECode.TOO_MANY_REDIRECTS), HTTPError, None), + (lambda: curl_cffi.requests.errors.RequestsError( + '', code=curl_cffi.const.CurlECode.PROXY), ProxyError, None), + ]) + def test_request_error_mapping(self, handler, monkeypatch, raised, expected, match): + import curl_cffi.requests + curl_res = curl_cffi.requests.Response() + curl_res.status_code = 301 + + with handler() as rh: + original_get_instance = rh._get_instance + + def mock_get_instance(*args, **kwargs): + instance = original_get_instance(*args, **kwargs) + + def request(*_, **__): + try: + raise raised() + except Exception as e: + e.response = curl_res + raise + monkeypatch.setattr(instance, 'request', request) + return instance + + monkeypatch.setattr(rh, '_get_instance', mock_get_instance) + + with pytest.raises(expected) as exc_info: + rh.send(Request('http://fake')) + + assert exc_info.type is expected + + def test_response_reader(self, handler): + class FakeResponse: + def __init__(self, raise_error=False): + self.raise_error = raise_error + self.closed = False + + def iter_content(self): + yield b'foo' + yield b'bar' + yield b'z' + if self.raise_error: + raise Exception('test') + + def close(self): + self.closed = True + + from yt_dlp.networking._curlcffi import CurlCFFIResponseReader + + res = 
CurlCFFIResponseReader(FakeResponse()) + assert res.readable + assert res.bytes_read == 0 + assert res.read(1) == b'f' + assert res.bytes_read == 3 + assert res._buffer == b'oo' + + assert res.read(2) == b'oo' + assert res.bytes_read == 3 + assert res._buffer == b'' + + assert res.read(2) == b'ba' + assert res.bytes_read == 6 + assert res._buffer == b'r' + + assert res.read(3) == b'rz' + assert res.bytes_read == 7 + assert res._buffer == b'' + assert res.closed + assert res._response.closed + + # should handle no size param + res2 = CurlCFFIResponseReader(FakeResponse()) + assert res2.read() == b'foobarz' + assert res2.bytes_read == 7 + assert res2._buffer == b'' + assert res2.closed + + # should close on an exception + res3 = CurlCFFIResponseReader(FakeResponse(raise_error=True)) + with pytest.raises(Exception, match='test'): + res3.read() + assert res3._buffer == b'' + assert res3.bytes_read == 7 + assert res3.closed + + # buffer should be cleared on close + res4 = CurlCFFIResponseReader(FakeResponse()) + res4.read(2) + assert res4._buffer == b'o' + res4.close() + assert res4.closed + assert res4._buffer == b'' + + def run_validation(handler, error, req, **handler_kwargs): with handler(**handler_kwargs) as rh: if error: @@ -975,6 +1174,10 @@ class HTTPSupportedRH(ValidationRH): ('ws', False, {}), ('wss', False, {}), ]), + ('CurlCFFI', [ + ('http', False, {}), + ('https', False, {}), + ]), (NoCheckRH, [('http', False, {})]), (ValidationRH, [('http', UnsupportedRequest, {})]) ] @@ -998,6 +1201,14 @@ class HTTPSupportedRH(ValidationRH): ('socks5', False), ('socks5h', False), ]), + ('CurlCFFI', 'http', [ + ('http', False), + ('https', False), + ('socks4', False), + ('socks4a', False), + ('socks5', False), + ('socks5h', False), + ]), (NoCheckRH, 'http', [('http', False)]), (HTTPSupportedRH, 'http', [('http', UnsupportedRequest)]), ('Websockets', 'ws', [('http', UnsupportedRequest)]), @@ -1015,6 +1226,10 @@ class HTTPSupportedRH(ValidationRH): ('all', False), 
('unrelated', False), ]), + ('CurlCFFI', [ + ('all', False), + ('unrelated', False), + ]), (NoCheckRH, [('all', False)]), (HTTPSupportedRH, [('all', UnsupportedRequest)]), (HTTPSupportedRH, [('no', UnsupportedRequest)]), @@ -1036,6 +1251,19 @@ class HTTPSupportedRH(ValidationRH): ({'timeout': 'notatimeout'}, AssertionError), ({'unsupported': 'value'}, UnsupportedRequest), ]), + ('CurlCFFI', 'http', [ + ({'cookiejar': 'notacookiejar'}, AssertionError), + ({'cookiejar': YoutubeDLCookieJar()}, False), + ({'timeout': 1}, False), + ({'timeout': 'notatimeout'}, AssertionError), + ({'unsupported': 'value'}, UnsupportedRequest), + ({'impersonate': ImpersonateTarget('badtarget', None, None, None)}, UnsupportedRequest), + ({'impersonate': 123}, AssertionError), + ({'impersonate': ImpersonateTarget('chrome', None, None, None)}, False), + ({'impersonate': ImpersonateTarget(None, None, None, None)}, False), + ({'impersonate': ImpersonateTarget()}, False), + ({'impersonate': 'chrome'}, AssertionError) + ]), (NoCheckRH, 'http', [ ({'cookiejar': 'notacookiejar'}, False), ({'somerandom': 'test'}, False), # but any extension is allowed through @@ -1055,7 +1283,7 @@ class HTTPSupportedRH(ValidationRH): def test_url_scheme(self, handler, scheme, fail, handler_kwargs): run_validation(handler, fail, Request(f'{scheme}://'), **(handler_kwargs or {})) - @pytest.mark.parametrize('handler,fail', [('Urllib', False), ('Requests', False)], indirect=['handler']) + @pytest.mark.parametrize('handler,fail', [('Urllib', False), ('Requests', False), ('CurlCFFI', False)], indirect=['handler']) def test_no_proxy(self, handler, fail): run_validation(handler, fail, Request('http://', proxies={'no': '127.0.0.1,github.com'})) run_validation(handler, fail, Request('http://'), proxies={'no': '127.0.0.1,github.com'}) @@ -1078,13 +1306,13 @@ def test_proxy_scheme(self, handler, req_scheme, scheme, fail): run_validation(handler, fail, Request(f'{req_scheme}://', proxies={req_scheme: 
f'{scheme}://example.com'})) run_validation(handler, fail, Request(f'{req_scheme}://'), proxies={req_scheme: f'{scheme}://example.com'}) - @pytest.mark.parametrize('handler', ['Urllib', HTTPSupportedRH, 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', HTTPSupportedRH, 'Requests', 'CurlCFFI'], indirect=True) def test_empty_proxy(self, handler): run_validation(handler, False, Request('http://', proxies={'http': None})) run_validation(handler, False, Request('http://'), proxies={'http': None}) @pytest.mark.parametrize('proxy_url', ['//example.com', 'example.com', '127.0.0.1', '/a/b/c']) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_invalid_proxy_url(self, handler, proxy_url): run_validation(handler, UnsupportedRequest, Request('http://', proxies={'http': proxy_url})) @@ -1113,6 +1341,10 @@ def __init__(self, request): class FakeRH(RequestHandler): + def __init__(self, *args, **params): + self.params = params + super().__init__(*args, **params) + def _validate(self, request): return @@ -1271,15 +1503,10 @@ def test_compat_opener(self): ('', {'all': '__noproxy__'}), (None, {'http': 'http://127.0.0.1:8081', 'https': 'http://127.0.0.1:8081'}) # env, set https ]) - def test_proxy(self, proxy, expected): - old_http_proxy = os.environ.get('HTTP_PROXY') - try: - os.environ['HTTP_PROXY'] = 'http://127.0.0.1:8081' # ensure that provided proxies override env - with FakeYDL({'proxy': proxy}) as ydl: - assert ydl.proxies == expected - finally: - if old_http_proxy: - os.environ['HTTP_PROXY'] = old_http_proxy + def test_proxy(self, proxy, expected, monkeypatch): + monkeypatch.setenv('HTTP_PROXY', 'http://127.0.0.1:8081') + with FakeYDL({'proxy': proxy}) as ydl: + assert ydl.proxies == expected def test_compat_request(self): with FakeRHYDL() as ydl: @@ -1331,6 +1558,95 @@ def test_legacy_server_connect_error(self): with 
pytest.raises(SSLError, match='testerror'): ydl.urlopen('ssl://testerror') + def test_unsupported_impersonate_target(self): + class FakeImpersonationRHYDL(FakeYDL): + def __init__(self, *args, **kwargs): + class HTTPRH(RequestHandler): + def _send(self, request: Request): + pass + _SUPPORTED_URL_SCHEMES = ('http',) + _SUPPORTED_PROXY_SCHEMES = None + + super().__init__(*args, **kwargs) + self._request_director = self.build_request_director([HTTPRH]) + + with FakeImpersonationRHYDL() as ydl: + with pytest.raises( + RequestError, + match=r'Impersonate target "test" is not available' + ): + ydl.urlopen(Request('http://', extensions={'impersonate': ImpersonateTarget('test', None, None, None)})) + + def test_unsupported_impersonate_extension(self): + class FakeHTTPRHYDL(FakeYDL): + def __init__(self, *args, **kwargs): + class IRH(ImpersonateRequestHandler): + def _send(self, request: Request): + pass + + _SUPPORTED_URL_SCHEMES = ('http',) + _SUPPORTED_IMPERSONATE_TARGET_MAP = {ImpersonateTarget('abc',): 'test'} + _SUPPORTED_PROXY_SCHEMES = None + + super().__init__(*args, **kwargs) + self._request_director = self.build_request_director([IRH]) + + with FakeHTTPRHYDL() as ydl: + with pytest.raises( + RequestError, + match=r'Impersonate target "test" is not available' + ): + ydl.urlopen(Request('http://', extensions={'impersonate': ImpersonateTarget('test', None, None, None)})) + + def test_raise_impersonate_error(self): + with pytest.raises( + YoutubeDLError, + match=r'Impersonate target "test" is not available' + ): + FakeYDL({'impersonate': ImpersonateTarget('test', None, None, None)}) + + def test_pass_impersonate_param(self, monkeypatch): + + class IRH(ImpersonateRequestHandler): + def _send(self, request: Request): + pass + + _SUPPORTED_URL_SCHEMES = ('http',) + _SUPPORTED_IMPERSONATE_TARGET_MAP = {ImpersonateTarget('abc'): 'test'} + + # Bypass the check on initialize + brh = FakeYDL.build_request_director + monkeypatch.setattr(FakeYDL, 'build_request_director', 
lambda cls, handlers, preferences=None: brh(cls, handlers=[IRH])) + + with FakeYDL({ + 'impersonate': ImpersonateTarget('abc', None, None, None) + }) as ydl: + rh = self.build_handler(ydl, IRH) + assert rh.impersonate == ImpersonateTarget('abc', None, None, None) + + def test_get_impersonate_targets(self): + handlers = [] + for target_client in ('abc', 'xyz', 'asd'): + class TestRH(ImpersonateRequestHandler): + def _send(self, request: Request): + pass + _SUPPORTED_URL_SCHEMES = ('http',) + _SUPPORTED_IMPERSONATE_TARGET_MAP = {ImpersonateTarget(target_client,): 'test'} + RH_KEY = target_client + RH_NAME = target_client + handlers.append(TestRH) + + with FakeYDL() as ydl: + ydl._request_director = ydl.build_request_director(handlers) + assert set(ydl._get_available_impersonate_targets()) == { + (ImpersonateTarget('xyz'), 'xyz'), + (ImpersonateTarget('abc'), 'abc'), + (ImpersonateTarget('asd'), 'asd') + } + assert ydl._impersonate_target_available(ImpersonateTarget('abc')) + assert ydl._impersonate_target_available(ImpersonateTarget()) + assert not ydl._impersonate_target_available(ImpersonateTarget('zxy')) + @pytest.mark.parametrize('proxy_key,proxy_url,expected', [ ('http', '__noproxy__', None), ('no', '127.0.0.1,foo.bar', '127.0.0.1,foo.bar'), @@ -1341,23 +1657,17 @@ def test_legacy_server_connect_error(self): ('http', 'socks4://example.com', 'socks4://example.com'), ('unrelated', '/bad/proxy', '/bad/proxy'), # clean_proxies should ignore bad proxies ]) - def test_clean_proxy(self, proxy_key, proxy_url, expected): + def test_clean_proxy(self, proxy_key, proxy_url, expected, monkeypatch): # proxies should be cleaned in urlopen() with FakeRHYDL() as ydl: req = ydl.urlopen(Request('test://', proxies={proxy_key: proxy_url})).request assert req.proxies[proxy_key] == expected # and should also be cleaned when building the handler - env_key = f'{proxy_key.upper()}_PROXY' - old_env_proxy = os.environ.get(env_key) - try: - os.environ[env_key] = proxy_url # ensure that 
provided proxies override env - with FakeYDL() as ydl: - rh = self.build_handler(ydl) - assert rh.proxies[proxy_key] == expected - finally: - if old_env_proxy: - os.environ[env_key] = old_env_proxy + monkeypatch.setenv(f'{proxy_key.upper()}_PROXY', proxy_url) + with FakeYDL() as ydl: + rh = self.build_handler(ydl) + assert rh.proxies[proxy_key] == expected def test_clean_proxy_header(self): with FakeRHYDL() as ydl: @@ -1629,3 +1939,71 @@ def test_compat(self): assert res.geturl() == res.url assert res.info() is res.headers assert res.getheader('test') == res.get_header('test') + + +class TestImpersonateTarget: + @pytest.mark.parametrize('target_str,expected', [ + ('abc', ImpersonateTarget('abc', None, None, None)), + ('abc-120_esr', ImpersonateTarget('abc', '120_esr', None, None)), + ('abc-120:xyz', ImpersonateTarget('abc', '120', 'xyz', None)), + ('abc-120:xyz-5.6', ImpersonateTarget('abc', '120', 'xyz', '5.6')), + ('abc:xyz', ImpersonateTarget('abc', None, 'xyz', None)), + ('abc:', ImpersonateTarget('abc', None, None, None)), + ('abc-120:', ImpersonateTarget('abc', '120', None, None)), + (':xyz', ImpersonateTarget(None, None, 'xyz', None)), + (':xyz-6.5', ImpersonateTarget(None, None, 'xyz', '6.5')), + (':', ImpersonateTarget(None, None, None, None)), + ('', ImpersonateTarget(None, None, None, None)), + ]) + def test_target_from_str(self, target_str, expected): + assert ImpersonateTarget.from_str(target_str) == expected + + @pytest.mark.parametrize('target_str', [ + '-120', ':-12.0', '-12:-12', '-:-', + '::', 'a-c-d:', 'a-c-d:e-f-g', 'a:b:' + ]) + def test_target_from_invalid_str(self, target_str): + with pytest.raises(ValueError): + ImpersonateTarget.from_str(target_str) + + @pytest.mark.parametrize('target,expected', [ + (ImpersonateTarget('abc', None, None, None), 'abc'), + (ImpersonateTarget('abc', '120', None, None), 'abc-120'), + (ImpersonateTarget('abc', '120', 'xyz', None), 'abc-120:xyz'), + (ImpersonateTarget('abc', '120', 'xyz', '5'), 'abc-120:xyz-5'), 
+ (ImpersonateTarget('abc', None, 'xyz', None), 'abc:xyz'), + (ImpersonateTarget('abc', '120', None, None), 'abc-120'), + (ImpersonateTarget('abc', '120', 'xyz', None), 'abc-120:xyz'), + (ImpersonateTarget('abc', None, 'xyz'), 'abc:xyz'), + (ImpersonateTarget(None, None, 'xyz', '6.5'), ':xyz-6.5'), + (ImpersonateTarget('abc', ), 'abc'), + (ImpersonateTarget(None, None, None, None), ''), + ]) + def test_str(self, target, expected): + assert str(target) == expected + + @pytest.mark.parametrize('args', [ + ('abc', None, None, '5'), + ('abc', '120', None, '5'), + (None, '120', None, None), + (None, '120', None, '5'), + (None, None, None, '5'), + (None, '120', 'xyz', '5'), + ]) + def test_invalid_impersonate_target(self, args): + with pytest.raises(ValueError): + ImpersonateTarget(*args) + + @pytest.mark.parametrize('target1,target2,is_in,is_eq', [ + (ImpersonateTarget('abc', None, None, None), ImpersonateTarget('abc', None, None, None), True, True), + (ImpersonateTarget('abc', None, None, None), ImpersonateTarget('abc', '120', None, None), True, False), + (ImpersonateTarget('abc', None, 'xyz', 'test'), ImpersonateTarget('abc', '120', 'xyz', None), True, False), + (ImpersonateTarget('abc', '121', 'xyz', 'test'), ImpersonateTarget('abc', '120', 'xyz', 'test'), False, False), + (ImpersonateTarget('abc'), ImpersonateTarget('abc', '120', 'xyz', 'test'), True, False), + (ImpersonateTarget('abc', '120', 'xyz', 'test'), ImpersonateTarget('abc'), True, False), + (ImpersonateTarget(), ImpersonateTarget('abc', '120', 'xyz'), True, False), + (ImpersonateTarget(), ImpersonateTarget(), True, True), + ]) + def test_impersonate_target_in(self, target1, target2, is_in, is_eq): + assert (target1 in target2) is is_in + assert (target1 == target2) is is_eq diff --git a/test/test_socks.py b/test/test_socks.py index cb22b61dc8a5..43d612d85d17 100644 --- a/test/test_socks.py +++ b/test/test_socks.py @@ -286,8 +286,14 @@ def ctx(request): return CTX_MAP[request.param]() 
+@pytest.mark.parametrize( + 'handler,ctx', [ + ('Urllib', 'http'), + ('Requests', 'http'), + ('Websockets', 'ws'), + ('CurlCFFI', 'http') + ], indirect=True) class TestSocks4Proxy: - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_socks4_no_auth(self, handler, ctx): with handler() as rh: with ctx.socks_server(Socks4ProxyHandler) as server_address: @@ -295,7 +301,6 @@ def test_socks4_no_auth(self, handler, ctx): rh, proxies={'all': f'socks4://{server_address}'}) assert response['version'] == 4 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_socks4_auth(self, handler, ctx): with handler() as rh: with ctx.socks_server(Socks4ProxyHandler, user_id='user') as server_address: @@ -305,7 +310,6 @@ def test_socks4_auth(self, handler, ctx): rh, proxies={'all': f'socks4://user:@{server_address}'}) assert response['version'] == 4 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_socks4a_ipv4_target(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: with handler(proxies={'all': f'socks4a://{server_address}'}) as rh: @@ -313,7 +317,6 @@ def test_socks4a_ipv4_target(self, handler, ctx): assert response['version'] == 4 assert (response['ipv4_address'] == '127.0.0.1') != (response['domain_address'] == '127.0.0.1') - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_socks4a_domain_target(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: with handler(proxies={'all': f'socks4a://{server_address}'}) as rh: @@ -322,7 +325,6 @@ def test_socks4a_domain_target(self, handler, ctx): assert response['ipv4_address'] is None assert response['domain_address'] == 'localhost' - 
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_ipv4_client_source_address(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: source_address = f'127.0.0.{random.randint(5, 255)}' @@ -333,7 +335,6 @@ def test_ipv4_client_source_address(self, handler, ctx): assert response['client_address'][0] == source_address assert response['version'] == 4 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) @pytest.mark.parametrize('reply_code', [ Socks4CD.REQUEST_REJECTED_OR_FAILED, Socks4CD.REQUEST_REJECTED_CANNOT_CONNECT_TO_IDENTD, @@ -345,7 +346,6 @@ def test_socks4_errors(self, handler, ctx, reply_code): with pytest.raises(ProxyError): ctx.socks_info_request(rh) - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_ipv6_socks4_proxy(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler, bind_ip='::1') as server_address: with handler(proxies={'all': f'socks4://{server_address}'}) as rh: @@ -354,7 +354,6 @@ def test_ipv6_socks4_proxy(self, handler, ctx): assert response['ipv4_address'] == '127.0.0.1' assert response['version'] == 4 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_timeout(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler, sleep=2) as server_address: with handler(proxies={'all': f'socks4://{server_address}'}, timeout=0.5) as rh: @@ -362,9 +361,15 @@ def test_timeout(self, handler, ctx): ctx.socks_info_request(rh) +@pytest.mark.parametrize( + 'handler,ctx', [ + ('Urllib', 'http'), + ('Requests', 'http'), + ('Websockets', 'ws'), + ('CurlCFFI', 'http') + ], indirect=True) class TestSocks5Proxy: - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], 
indirect=True) def test_socks5_no_auth(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -372,7 +377,6 @@ def test_socks5_no_auth(self, handler, ctx): assert response['auth_methods'] == [0x0] assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_socks5_user_pass(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler, auth=('test', 'testpass')) as server_address: with handler() as rh: @@ -385,7 +389,6 @@ def test_socks5_user_pass(self, handler, ctx): assert response['auth_methods'] == [Socks5Auth.AUTH_NONE, Socks5Auth.AUTH_USER_PASS] assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_socks5_ipv4_target(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -393,7 +396,6 @@ def test_socks5_ipv4_target(self, handler, ctx): assert response['ipv4_address'] == '127.0.0.1' assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_socks5_domain_target(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -401,7 +403,6 @@ def test_socks5_domain_target(self, handler, ctx): assert (response['ipv4_address'] == '127.0.0.1') != (response['ipv6_address'] == '::1') assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_socks5h_domain_target(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': 
f'socks5h://{server_address}'}) as rh: @@ -410,7 +411,6 @@ def test_socks5h_domain_target(self, handler, ctx): assert response['domain_address'] == 'localhost' assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_socks5h_ip_target(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5h://{server_address}'}) as rh: @@ -419,7 +419,6 @@ def test_socks5h_ip_target(self, handler, ctx): assert response['domain_address'] is None assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_socks5_ipv6_destination(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -427,7 +426,6 @@ def test_socks5_ipv6_destination(self, handler, ctx): assert response['ipv6_address'] == '::1' assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_ipv6_socks5_proxy(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler, bind_ip='::1') as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -438,7 +436,6 @@ def test_ipv6_socks5_proxy(self, handler, ctx): # XXX: is there any feasible way of testing IPv6 source addresses? # Same would go for non-proxy source_address test... 
- @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) def test_ipv4_client_source_address(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: source_address = f'127.0.0.{random.randint(5, 255)}' @@ -448,7 +445,6 @@ def test_ipv4_client_source_address(self, handler, ctx): assert response['client_address'][0] == source_address assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True) @pytest.mark.parametrize('reply_code', [ Socks5Reply.GENERAL_FAILURE, Socks5Reply.CONNECTION_NOT_ALLOWED, @@ -465,7 +461,6 @@ def test_socks5_errors(self, handler, ctx, reply_code): with pytest.raises(ProxyError): ctx.socks_info_request(rh) - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Websockets', 'ws')], indirect=True) def test_timeout(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler, sleep=2) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}, timeout=1) as rh: diff --git a/test/test_traversal.py b/test/test_traversal.py new file mode 100644 index 000000000000..ed29d03ad562 --- /dev/null +++ b/test/test_traversal.py @@ -0,0 +1,442 @@ +import http.cookies +import re +import xml.etree.ElementTree + +import pytest + +from yt_dlp.utils import dict_get, int_or_none, str_or_none +from yt_dlp.utils.traversal import traverse_obj + +_TEST_DATA = { + 100: 100, + 1.2: 1.2, + 'str': 'str', + 'None': None, + '...': ..., + 'urls': [ + {'index': 0, 'url': 'https://www.example.com/0'}, + {'index': 1, 'url': 'https://www.example.com/1'}, + ], + 'data': ( + {'index': 2}, + {'index': 3}, + ), + 'dict': {}, +} + + +class TestTraversal: + def test_dict_get(self): + FALSE_VALUES = { + 'none': None, + 'false': False, + 'zero': 0, + 'empty_string': '', + 'empty_list': [], + } + d = {**FALSE_VALUES, 'a': 42} + assert dict_get(d, 'a') == 42 + assert 
dict_get(d, 'b') is None + assert dict_get(d, 'b', 42) == 42 + assert dict_get(d, ('a',)) == 42 + assert dict_get(d, ('b', 'a')) == 42 + assert dict_get(d, ('b', 'c', 'a', 'd')) == 42 + assert dict_get(d, ('b', 'c')) is None + assert dict_get(d, ('b', 'c'), 42) == 42 + for key, false_value in FALSE_VALUES.items(): + assert dict_get(d, ('b', 'c', key)) is None + assert dict_get(d, ('b', 'c', key), skip_false_values=False) == false_value + + def test_traversal_base(self): + assert traverse_obj(_TEST_DATA, ('str',)) == 'str', \ + 'allow tuple path' + assert traverse_obj(_TEST_DATA, ['str']) == 'str', \ + 'allow list path' + assert traverse_obj(_TEST_DATA, (value for value in ("str",))) == 'str', \ + 'allow iterable path' + assert traverse_obj(_TEST_DATA, 'str') == 'str', \ + 'single items should be treated as a path' + assert traverse_obj(_TEST_DATA, 100) == 100, \ + 'allow int path' + assert traverse_obj(_TEST_DATA, 1.2) == 1.2, \ + 'allow float path' + assert traverse_obj(_TEST_DATA, None) == _TEST_DATA, \ + '`None` should not perform any modification' + + def test_traversal_ellipsis(self): + assert traverse_obj(_TEST_DATA, ...) == [x for x in _TEST_DATA.values() if x not in (None, {})], \ + '`...` should give all non discarded values' + assert traverse_obj(_TEST_DATA, ('urls', 0, ...)) == list(_TEST_DATA['urls'][0].values()), \ + '`...` selection for dicts should select all values' + assert traverse_obj(_TEST_DATA, (..., ..., 'url')) == ['https://www.example.com/0', 'https://www.example.com/1'], \ + 'nested `...` queries should work' + assert traverse_obj(_TEST_DATA, (..., ..., 'index')) == list(range(4)), \ + '`...` query result should be flattened' + assert traverse_obj(iter(range(4)), ...) 
== list(range(4)), \ + '`...` should accept iterables' + + def test_traversal_function(self): + filter_func = lambda x, y: x == 'urls' and isinstance(y, list) + assert traverse_obj(_TEST_DATA, filter_func) == [_TEST_DATA['urls']], \ + 'function as query key should perform a filter based on (key, value)' + assert traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)) == ['str'], \ + 'exceptions in the query function should be catched' + assert traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0) == [0, 2], \ + 'function key should accept iterables' + # Wrong function signature should raise (debug mode) + with pytest.raises(Exception): + traverse_obj(_TEST_DATA, lambda a: ...) + with pytest.raises(Exception): + traverse_obj(_TEST_DATA, lambda a, b, c: ...) + + def test_traversal_set(self): + # transformation/type, like `expected_type` + assert traverse_obj(_TEST_DATA, (..., {str.upper}, )) == ['STR'], \ + 'Function in set should be a transformation' + assert traverse_obj(_TEST_DATA, (..., {str})) == ['str'], \ + 'Type in set should be a type filter' + assert traverse_obj(_TEST_DATA, (..., {str, int})) == [100, 'str'], \ + 'Multiple types in set should be a type filter' + assert traverse_obj(_TEST_DATA, {dict}) == _TEST_DATA, \ + 'A single set should be wrapped into a path' + assert traverse_obj(_TEST_DATA, (..., {str.upper})) == ['STR'], \ + 'Transformation function should not raise' + expected = [x for x in map(str_or_none, _TEST_DATA.values()) if x is not None] + assert traverse_obj(_TEST_DATA, (..., {str_or_none})) == expected, \ + 'Function in set should be a transformation' + assert traverse_obj(_TEST_DATA, ('fail', {lambda _: 'const'})) == 'const', \ + 'Function in set should always be called' + # Sets with length < 1 or > 1 not including only types should raise + with pytest.raises(Exception): + traverse_obj(_TEST_DATA, set()) + with pytest.raises(Exception): + traverse_obj(_TEST_DATA, {str.upper, str}) + + def test_traversal_slice(self): + _SLICE_DATA 
= [0, 1, 2, 3, 4] + + assert traverse_obj(_TEST_DATA, ('dict', slice(1))) is None, \ + 'slice on a dictionary should not throw' + assert traverse_obj(_SLICE_DATA, slice(1)) == _SLICE_DATA[:1], \ + 'slice key should apply slice to sequence' + assert traverse_obj(_SLICE_DATA, slice(1, 2)) == _SLICE_DATA[1:2], \ + 'slice key should apply slice to sequence' + assert traverse_obj(_SLICE_DATA, slice(1, 4, 2)) == _SLICE_DATA[1:4:2], \ + 'slice key should apply slice to sequence' + + def test_traversal_alternatives(self): + assert traverse_obj(_TEST_DATA, 'fail', 'str') == 'str', \ + 'multiple `paths` should be treated as alternative paths' + assert traverse_obj(_TEST_DATA, 'str', 100) == 'str', \ + 'alternatives should exit early' + assert traverse_obj(_TEST_DATA, 'fail', 'fail') is None, \ + 'alternatives should return `default` if exhausted' + assert traverse_obj(_TEST_DATA, (..., 'fail'), 100) == 100, \ + 'alternatives should track their own branching return' + assert traverse_obj(_TEST_DATA, ('dict', ...), ('data', ...)) == list(_TEST_DATA['data']), \ + 'alternatives on empty objects should search further' + + def test_traversal_branching_nesting(self): + assert traverse_obj(_TEST_DATA, ('urls', (3, 0), 'url')) == ['https://www.example.com/0'], \ + 'tuple as key should be treated as branches' + assert traverse_obj(_TEST_DATA, ('urls', [3, 0], 'url')) == ['https://www.example.com/0'], \ + 'list as key should be treated as branches' + assert traverse_obj(_TEST_DATA, ('urls', ((1, 'fail'), (0, 'url')))) == ['https://www.example.com/0'], \ + 'double nesting in path should be treated as paths' + assert traverse_obj(['0', [1, 2]], [(0, 1), 0]) == [1], \ + 'do not fail early on branching' + expected = ['https://www.example.com/0', 'https://www.example.com/1'] + assert traverse_obj(_TEST_DATA, ('urls', ((0, ('fail', 'url')), (1, 'url')))) == expected, \ + 'tripple nesting in path should be treated as branches' + assert traverse_obj(_TEST_DATA, ('urls', ('fail', (..., 
'url')))) == expected, \ + 'ellipsis as branch path start gets flattened' + + def test_traversal_dict(self): + assert traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}) == {0: 100, 1: 1.2}, \ + 'dict key should result in a dict with the same keys' + expected = {0: 'https://www.example.com/0'} + assert traverse_obj(_TEST_DATA, {0: ('urls', 0, 'url')}) == expected, \ + 'dict key should allow paths' + expected = {0: ['https://www.example.com/0']} + assert traverse_obj(_TEST_DATA, {0: ('urls', (3, 0), 'url')}) == expected, \ + 'tuple in dict path should be treated as branches' + assert traverse_obj(_TEST_DATA, {0: ('urls', ((1, 'fail'), (0, 'url')))}) == expected, \ + 'double nesting in dict path should be treated as paths' + expected = {0: ['https://www.example.com/1', 'https://www.example.com/0']} + assert traverse_obj(_TEST_DATA, {0: ('urls', ((1, ('fail', 'url')), (0, 'url')))}) == expected, \ + 'tripple nesting in dict path should be treated as branches' + assert traverse_obj(_TEST_DATA, {0: 'fail'}) == {}, \ + 'remove `None` values when top level dict key fails' + assert traverse_obj(_TEST_DATA, {0: 'fail'}, default=...) == {0: ...}, \ + 'use `default` if key fails and `default`' + assert traverse_obj(_TEST_DATA, {0: 'dict'}) == {}, \ + 'remove empty values when dict key' + assert traverse_obj(_TEST_DATA, {0: 'dict'}, default=...) == {0: ...}, \ + 'use `default` when dict key and `default`' + assert traverse_obj(_TEST_DATA, {0: {0: 'fail'}}) == {}, \ + 'remove empty values when nested dict key fails' + assert traverse_obj(None, {0: 'fail'}) == {}, \ + 'default to dict if pruned' + assert traverse_obj(None, {0: 'fail'}, default=...) == {0: ...}, \ + 'default to dict if pruned and default is given' + assert traverse_obj(_TEST_DATA, {0: {0: 'fail'}}, default=...) 
== {0: {0: ...}}, \ + 'use nested `default` when nested dict key fails and `default`' + assert traverse_obj(_TEST_DATA, {0: ('dict', ...)}) == {}, \ + 'remove key if branch in dict key not successful' + + def test_traversal_default(self): + _DEFAULT_DATA = {'None': None, 'int': 0, 'list': []} + + assert traverse_obj(_DEFAULT_DATA, 'fail') is None, \ + 'default value should be `None`' + assert traverse_obj(_DEFAULT_DATA, 'fail', 'fail', default=...) == ..., \ + 'chained fails should result in default' + assert traverse_obj(_DEFAULT_DATA, 'None', 'int') == 0, \ + 'should not short cirquit on `None`' + assert traverse_obj(_DEFAULT_DATA, 'fail', default=1) == 1, \ + 'invalid dict key should result in `default`' + assert traverse_obj(_DEFAULT_DATA, 'None', default=1) == 1, \ + '`None` is a deliberate sentinel and should become `default`' + assert traverse_obj(_DEFAULT_DATA, ('list', 10)) is None, \ + '`IndexError` should result in `default`' + assert traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=1) == 1, \ + 'if branched but not successful return `default` if defined, not `[]`' + assert traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=None) is None, \ + 'if branched but not successful return `default` even if `default` is `None`' + assert traverse_obj(_DEFAULT_DATA, (..., 'fail')) == [], \ + 'if branched but not successful return `[]`, not `default`' + assert traverse_obj(_DEFAULT_DATA, ('list', ...)) == [], \ + 'if branched but object is empty return `[]`, not `default`' + assert traverse_obj(None, ...) 
== [], \ + 'if branched but object is `None` return `[]`, not `default`' + assert traverse_obj({0: None}, (0, ...)) == [], \ + 'if branched but state is `None` return `[]`, not `default`' + + @pytest.mark.parametrize('path', [ + ('fail', ...), + (..., 'fail'), + 100 * ('fail',) + (...,), + (...,) + 100 * ('fail',), + ]) + def test_traversal_branching(self, path): + assert traverse_obj({}, path) == [], \ + 'if branched but state is `None`, return `[]` (not `default`)' + assert traverse_obj({}, 'fail', path) == [], \ + 'if branching in last alternative and previous did not match, return `[]` (not `default`)' + assert traverse_obj({0: 'x'}, 0, path) == 'x', \ + 'if branching in last alternative and previous did match, return single value' + assert traverse_obj({0: 'x'}, path, 0) == 'x', \ + 'if branching in first alternative and non-branching path does match, return single value' + assert traverse_obj({}, path, 'fail') is None, \ + 'if branching in first alternative and non-branching path does not match, return `default`' + + def test_traversal_expected_type(self): + _EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0} + + assert traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str) == 'str', \ + 'accept matching `expected_type` type' + assert traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int) is None, \ + 'reject non matching `expected_type` type' + assert traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)) == '0', \ + 'transform type using type function' + assert traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=lambda _: 1 / 0) is None, \ + 'wrap expected_type fuction in try_call' + assert traverse_obj(_EXPECTED_TYPE_DATA, ..., expected_type=str) == ['str'], \ + 'eliminate items that expected_type fails on' + assert traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}, expected_type=int) == {0: 100}, \ + 'type as expected_type should filter dict values' + assert traverse_obj(_TEST_DATA, {0: 100, 1: 1.2, 2: 'None'}, expected_type=str_or_none) 
== {0: '100', 1: '1.2'}, \ + 'function as expected_type should transform dict values' + assert traverse_obj(_TEST_DATA, ({0: 1.2}, 0, {int_or_none}), expected_type=int) == 1, \ + 'expected_type should not filter non final dict values' + assert traverse_obj(_TEST_DATA, {0: {0: 100, 1: 'str'}}, expected_type=int) == {0: {0: 100}}, \ + 'expected_type should transform deep dict values' + assert traverse_obj(_TEST_DATA, [({0: '...'}, {0: '...'})], expected_type=type(...)) == [{0: ...}, {0: ...}], \ + 'expected_type should transform branched dict values' + assert traverse_obj({1: {3: 4}}, [(1, 2), 3], expected_type=int) == [4], \ + 'expected_type regression for type matching in tuple branching' + assert traverse_obj(_TEST_DATA, ['data', ...], expected_type=int) == [], \ + 'expected_type regression for type matching in dict result' + + def test_traversal_get_all(self): + _GET_ALL_DATA = {'key': [0, 1, 2]} + + assert traverse_obj(_GET_ALL_DATA, ('key', ...), get_all=False) == 0, \ + 'if not `get_all`, return only first matching value' + assert traverse_obj(_GET_ALL_DATA, ..., get_all=False) == [0, 1, 2], \ + 'do not overflatten if not `get_all`' + + def test_traversal_casesense(self): + _CASESENSE_DATA = { + 'KeY': 'value0', + 0: { + 'KeY': 'value1', + 0: {'KeY': 'value2'}, + }, + } + + assert traverse_obj(_CASESENSE_DATA, 'key') is None, \ + 'dict keys should be case sensitive unless `casesense`' + assert traverse_obj(_CASESENSE_DATA, 'keY', casesense=False) == 'value0', \ + 'allow non matching key case if `casesense`' + assert traverse_obj(_CASESENSE_DATA, [0, ('keY',)], casesense=False) == ['value1'], \ + 'allow non matching key case in branch if `casesense`' + assert traverse_obj(_CASESENSE_DATA, [0, ([0, 'keY'],)], casesense=False) == ['value2'], \ + 'allow non matching key case in branch path if `casesense`' + + def test_traversal_traverse_string(self): + _TRAVERSE_STRING_DATA = {'str': 'str', 1.2: 1.2} + + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0)) is 
None, \ + 'do not traverse into string if not `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0), traverse_string=True) == 's', \ + 'traverse into string if `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, (1.2, 1), traverse_string=True) == '.', \ + 'traverse into converted data if `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', ...), traverse_string=True) == 'str', \ + '`...` should result in string (same value) if `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', slice(0, None, 2)), traverse_string=True) == 'sr', \ + '`slice` should result in string if `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == "s"), traverse_string=True) == 'str', \ + 'function should result in string if `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)), traverse_string=True) == ['s', 'r'], \ + 'branching should result in list if `traverse_string`' + assert traverse_obj({}, (0, ...), traverse_string=True) == [], \ + 'branching should result in list if `traverse_string`' + assert traverse_obj({}, (0, lambda x, y: True), traverse_string=True) == [], \ + 'branching should result in list if `traverse_string`' + assert traverse_obj({}, (0, slice(1)), traverse_string=True) == [], \ + 'branching should result in list if `traverse_string`' + + def test_traversal_re(self): + mobj = re.fullmatch(r'0(12)(?P3)(4)?', '0123') + assert traverse_obj(mobj, ...) 
== [x for x in mobj.groups() if x is not None], \ + '`...` on a `re.Match` should give its `groups()`' + assert traverse_obj(mobj, lambda k, _: k in (0, 2)) == ['0123', '3'], \ + 'function on a `re.Match` should give groupno, value starting at 0' + assert traverse_obj(mobj, 'group') == '3', \ + 'str key on a `re.Match` should give group with that name' + assert traverse_obj(mobj, 2) == '3', \ + 'int key on a `re.Match` should give group with that name' + assert traverse_obj(mobj, 'gRoUp', casesense=False) == '3', \ + 'str key on a `re.Match` should respect casesense' + assert traverse_obj(mobj, 'fail') is None, \ + 'failing str key on a `re.Match` should return `default`' + assert traverse_obj(mobj, 'gRoUpS', casesense=False) is None, \ + 'failing str key on a `re.Match` should return `default`' + assert traverse_obj(mobj, 8) is None, \ + 'failing int key on a `re.Match` should return `default`' + assert traverse_obj(mobj, lambda k, _: k in (0, 'group')) == ['0123', '3'], \ + 'function on a `re.Match` should give group name as well' + + def test_traversal_xml_etree(self): + etree = xml.etree.ElementTree.fromstring(''' + + + 1 + 2008 + 141100 + + + + + 4 + 2011 + 59900 + + + + 68 + 2011 + 13600 + + + + ''') + assert traverse_obj(etree, '') == etree, \ + 'empty str key should return the element itself' + assert traverse_obj(etree, 'country') == list(etree), \ + 'str key should lead all children with that tag name' + assert traverse_obj(etree, ...) 
== list(etree), \ + '`...` as key should return all children' + assert traverse_obj(etree, lambda _, x: x[0].text == '4') == [etree[1]], \ + 'function as key should get element as value' + assert traverse_obj(etree, lambda i, _: i == 1) == [etree[1]], \ + 'function as key should get index as key' + assert traverse_obj(etree, 0) == etree[0], \ + 'int key should return the nth child' + expected = ['Austria', 'Switzerland', 'Malaysia', 'Costa Rica', 'Colombia'] + assert traverse_obj(etree, './/neighbor/@name') == expected, \ + '`@` at end of path should give that attribute' + assert traverse_obj(etree, '//neighbor/@fail') == [None, None, None, None, None], \ + '`@` at end of path should give `None`' + assert traverse_obj(etree, ('//neighbor/@', 2)) == {'name': 'Malaysia', 'direction': 'N'}, \ + '`@` should give the full attribute dict' + assert traverse_obj(etree, '//year/text()') == ['2008', '2011', '2011'], \ + '`text()` at end of path should give the inner text' + assert traverse_obj(etree, '//*[@direction]/@direction') == ['E', 'W', 'N', 'W', 'E'], \ + 'full Python xpath features should be supported' + assert traverse_obj(etree, (0, '@name')) == 'Liechtenstein', \ + 'special transformations should act on current element' + assert traverse_obj(etree, ('country', 0, ..., 'text()', {int_or_none})) == [1, 2008, 141100], \ + 'special transformations should act on current element' + + def test_traversal_unbranching(self): + assert traverse_obj(_TEST_DATA, [(100, 1.2), all]) == [100, 1.2], \ + '`all` should give all results as list' + assert traverse_obj(_TEST_DATA, [(100, 1.2), any]) == 100, \ + '`any` should give the first result' + assert traverse_obj(_TEST_DATA, [100, all]) == [100], \ + '`all` should give list if non branching' + assert traverse_obj(_TEST_DATA, [100, any]) == 100, \ + '`any` should give single item if non branching' + assert traverse_obj(_TEST_DATA, [('dict', 'None', 100), all]) == [100], \ + '`all` should filter `None` and empty dict' + assert 
traverse_obj(_TEST_DATA, [('dict', 'None', 100), any]) == 100, \ + '`any` should filter `None` and empty dict' + assert traverse_obj(_TEST_DATA, [{ + 'all': [('dict', 'None', 100, 1.2), all], + 'any': [('dict', 'None', 100, 1.2), any], + }]) == {'all': [100, 1.2], 'any': 100}, \ + '`all`/`any` should apply to each dict path separately' + assert traverse_obj(_TEST_DATA, [{ + 'all': [('dict', 'None', 100, 1.2), all], + 'any': [('dict', 'None', 100, 1.2), any], + }], get_all=False) == {'all': [100, 1.2], 'any': 100}, \ + '`all`/`any` should apply to dict regardless of `get_all`' + assert traverse_obj(_TEST_DATA, [('dict', 'None', 100, 1.2), all, {float}]) is None, \ + '`all` should reset branching status' + assert traverse_obj(_TEST_DATA, [('dict', 'None', 100, 1.2), any, {float}]) is None, \ + '`any` should reset branching status' + assert traverse_obj(_TEST_DATA, [('dict', 'None', 100, 1.2), all, ..., {float}]) == [1.2], \ + '`all` should allow further branching' + assert traverse_obj(_TEST_DATA, [('dict', 'None', 'urls', 'data'), any, ..., 'index']) == [0, 1], \ + '`any` should allow further branching' + + def test_traversal_morsel(self): + values = { + 'expires': 'a', + 'path': 'b', + 'comment': 'c', + 'domain': 'd', + 'max-age': 'e', + 'secure': 'f', + 'httponly': 'g', + 'version': 'h', + 'samesite': 'i', + } + morsel = http.cookies.Morsel() + morsel.set('item_key', 'item_value', 'coded_value') + morsel.update(values) + values['key'] = 'item_key' + values['value'] = 'item_value' + + for key, value in values.items(): + assert traverse_obj(morsel, key) == value, \ + 'Morsel should provide access to all values' + assert traverse_obj(morsel, ...) 
== list(values.values()), \ + '`...` should yield all values' + assert traverse_obj(morsel, lambda k, v: True) == list(values.values()), \ + 'function key should yield all values' + assert traverse_obj(morsel, [(None,), any]) == morsel, \ + 'Morsel should not be implicitly changed to dict on usage' diff --git a/test/test_utils.py b/test/test_utils.py index a3073f0e0ac2..71febeefd650 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2,7 +2,6 @@ # Allow direct execution import os -import re import sys import unittest import warnings @@ -45,7 +44,6 @@ determine_ext, determine_file_encoding, dfxp2srt, - dict_get, encode_base_n, encode_compat_str, encodeFilename, @@ -106,13 +104,11 @@ sanitize_url, shell_quote, smuggle_url, - str_or_none, str_to_int, strip_jsonp, strip_or_none, subtitles_filename, timeconvert, - traverse_obj, try_call, unescapeHTML, unified_strdate, @@ -755,28 +751,6 @@ def test_multipart_encode(self): self.assertRaises( ValueError, multipart_encode, {b'field': b'value'}, boundary='value') - def test_dict_get(self): - FALSE_VALUES = { - 'none': None, - 'false': False, - 'zero': 0, - 'empty_string': '', - 'empty_list': [], - } - d = FALSE_VALUES.copy() - d['a'] = 42 - self.assertEqual(dict_get(d, 'a'), 42) - self.assertEqual(dict_get(d, 'b'), None) - self.assertEqual(dict_get(d, 'b', 42), 42) - self.assertEqual(dict_get(d, ('a', )), 42) - self.assertEqual(dict_get(d, ('b', 'a', )), 42) - self.assertEqual(dict_get(d, ('b', 'c', 'a', 'd', )), 42) - self.assertEqual(dict_get(d, ('b', 'c', )), None) - self.assertEqual(dict_get(d, ('b', 'c', ), 42), 42) - for key, false_value in FALSE_VALUES.items(): - self.assertEqual(dict_get(d, ('b', 'c', key, )), None) - self.assertEqual(dict_get(d, ('b', 'c', key, ), skip_false_values=False), false_value) - def test_merge_dicts(self): self.assertEqual(merge_dicts({'a': 1}, {'b': 2}), {'a': 1, 'b': 2}) self.assertEqual(merge_dicts({'a': 1}, {'a': 2}), {'a': 1}) @@ -2039,359 +2013,6 @@ def test_variadic(self): 
warnings.simplefilter('ignore') self.assertEqual(variadic('spam', allowed_types=[dict]), 'spam') - def test_traverse_obj(self): - _TEST_DATA = { - 100: 100, - 1.2: 1.2, - 'str': 'str', - 'None': None, - '...': ..., - 'urls': [ - {'index': 0, 'url': 'https://www.example.com/0'}, - {'index': 1, 'url': 'https://www.example.com/1'}, - ], - 'data': ( - {'index': 2}, - {'index': 3}, - ), - 'dict': {}, - } - - # Test base functionality - self.assertEqual(traverse_obj(_TEST_DATA, ('str',)), 'str', - msg='allow tuple path') - self.assertEqual(traverse_obj(_TEST_DATA, ['str']), 'str', - msg='allow list path') - self.assertEqual(traverse_obj(_TEST_DATA, (value for value in ("str",))), 'str', - msg='allow iterable path') - self.assertEqual(traverse_obj(_TEST_DATA, 'str'), 'str', - msg='single items should be treated as a path') - self.assertEqual(traverse_obj(_TEST_DATA, None), _TEST_DATA) - self.assertEqual(traverse_obj(_TEST_DATA, 100), 100) - self.assertEqual(traverse_obj(_TEST_DATA, 1.2), 1.2) - - # Test Ellipsis behavior - self.assertCountEqual(traverse_obj(_TEST_DATA, ...), - (item for item in _TEST_DATA.values() if item not in (None, {})), - msg='`...` should give all non discarded values') - self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, ...)), _TEST_DATA['urls'][0].values(), - msg='`...` selection for dicts should select all values') - self.assertEqual(traverse_obj(_TEST_DATA, (..., ..., 'url')), - ['https://www.example.com/0', 'https://www.example.com/1'], - msg='nested `...` queries should work') - self.assertCountEqual(traverse_obj(_TEST_DATA, (..., ..., 'index')), range(4), - msg='`...` query result should be flattened') - self.assertEqual(traverse_obj(iter(range(4)), ...), list(range(4)), - msg='`...` should accept iterables') - - # Test function as key - self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)), - [_TEST_DATA['urls']], - msg='function as query key should perform a filter based on (key, value)') - 
self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), {'str'}, - msg='exceptions in the query function should be catched') - self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2], - msg='function key should accept iterables') - if __debug__: - with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'): - traverse_obj(_TEST_DATA, lambda a: ...) - with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'): - traverse_obj(_TEST_DATA, lambda a, b, c: ...) - - # Test set as key (transformation/type, like `expected_type`) - self.assertEqual(traverse_obj(_TEST_DATA, (..., {str.upper}, )), ['STR'], - msg='Function in set should be a transformation') - self.assertEqual(traverse_obj(_TEST_DATA, (..., {str})), ['str'], - msg='Type in set should be a type filter') - self.assertEqual(traverse_obj(_TEST_DATA, {dict}), _TEST_DATA, - msg='A single set should be wrapped into a path') - self.assertEqual(traverse_obj(_TEST_DATA, (..., {str.upper})), ['STR'], - msg='Transformation function should not raise') - self.assertEqual(traverse_obj(_TEST_DATA, (..., {str_or_none})), - [item for item in map(str_or_none, _TEST_DATA.values()) if item is not None], - msg='Function in set should be a transformation') - self.assertEqual(traverse_obj(_TEST_DATA, ('fail', {lambda _: 'const'})), 'const', - msg='Function in set should always be called') - if __debug__: - with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'): - traverse_obj(_TEST_DATA, set()) - with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'): - traverse_obj(_TEST_DATA, {str.upper, str}) - - # Test `slice` as a key - _SLICE_DATA = [0, 1, 2, 3, 4] - self.assertEqual(traverse_obj(_TEST_DATA, ('dict', slice(1))), None, - msg='slice on a dictionary should not throw') - self.assertEqual(traverse_obj(_SLICE_DATA, slice(1)), _SLICE_DATA[:1], - msg='slice key should 
apply slice to sequence') - self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 2)), _SLICE_DATA[1:2], - msg='slice key should apply slice to sequence') - self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 4, 2)), _SLICE_DATA[1:4:2], - msg='slice key should apply slice to sequence') - - # Test alternative paths - self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str', - msg='multiple `paths` should be treated as alternative paths') - self.assertEqual(traverse_obj(_TEST_DATA, 'str', 100), 'str', - msg='alternatives should exit early') - self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'fail'), None, - msg='alternatives should return `default` if exhausted') - self.assertEqual(traverse_obj(_TEST_DATA, (..., 'fail'), 100), 100, - msg='alternatives should track their own branching return') - self.assertEqual(traverse_obj(_TEST_DATA, ('dict', ...), ('data', ...)), list(_TEST_DATA['data']), - msg='alternatives on empty objects should search further') - - # Test branch and path nesting - self.assertEqual(traverse_obj(_TEST_DATA, ('urls', (3, 0), 'url')), ['https://www.example.com/0'], - msg='tuple as key should be treated as branches') - self.assertEqual(traverse_obj(_TEST_DATA, ('urls', [3, 0], 'url')), ['https://www.example.com/0'], - msg='list as key should be treated as branches') - self.assertEqual(traverse_obj(_TEST_DATA, ('urls', ((1, 'fail'), (0, 'url')))), ['https://www.example.com/0'], - msg='double nesting in path should be treated as paths') - self.assertEqual(traverse_obj(['0', [1, 2]], [(0, 1), 0]), [1], - msg='do not fail early on branching') - self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', ((1, ('fail', 'url')), (0, 'url')))), - ['https://www.example.com/0', 'https://www.example.com/1'], - msg='tripple nesting in path should be treated as branches') - self.assertEqual(traverse_obj(_TEST_DATA, ('urls', ('fail', (..., 'url')))), - ['https://www.example.com/0', 'https://www.example.com/1'], - msg='ellipsis as branch path start gets 
flattened') - - # Test dictionary as key - self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}), {0: 100, 1: 1.2}, - msg='dict key should result in a dict with the same keys') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', 0, 'url')}), - {0: 'https://www.example.com/0'}, - msg='dict key should allow paths') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', (3, 0), 'url')}), - {0: ['https://www.example.com/0']}, - msg='tuple in dict path should be treated as branches') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, 'fail'), (0, 'url')))}), - {0: ['https://www.example.com/0']}, - msg='double nesting in dict path should be treated as paths') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, ('fail', 'url')), (0, 'url')))}), - {0: ['https://www.example.com/1', 'https://www.example.com/0']}, - msg='tripple nesting in dict path should be treated as branches') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}), {}, - msg='remove `None` values when top level dict key fails') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}, default=...), {0: ...}, - msg='use `default` if key fails and `default`') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {}, - msg='remove empty values when dict key') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=...), {0: ...}, - msg='use `default` when dict key and `default`') - self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}), {}, - msg='remove empty values when nested dict key fails') - self.assertEqual(traverse_obj(None, {0: 'fail'}), {}, - msg='default to dict if pruned') - self.assertEqual(traverse_obj(None, {0: 'fail'}, default=...), {0: ...}, - msg='default to dict if pruned and default is given') - self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}, default=...), {0: {0: ...}}, - msg='use nested `default` when nested dict key fails and `default`') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', ...)}), {}, - msg='remove 
key if branch in dict key not successful') - - # Testing default parameter behavior - _DEFAULT_DATA = {'None': None, 'int': 0, 'list': []} - self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail'), None, - msg='default value should be `None`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail', 'fail', default=...), ..., - msg='chained fails should result in default') - self.assertEqual(traverse_obj(_DEFAULT_DATA, 'None', 'int'), 0, - msg='should not short cirquit on `None`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail', default=1), 1, - msg='invalid dict key should result in `default`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, 'None', default=1), 1, - msg='`None` is a deliberate sentinel and should become `default`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', 10)), None, - msg='`IndexError` should result in `default`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=1), 1, - msg='if branched but not successful return `default` if defined, not `[]`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=None), None, - msg='if branched but not successful return `default` even if `default` is `None`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, (..., 'fail')), [], - msg='if branched but not successful return `[]`, not `default`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', ...)), [], - msg='if branched but object is empty return `[]`, not `default`') - self.assertEqual(traverse_obj(None, ...), [], - msg='if branched but object is `None` return `[]`, not `default`') - self.assertEqual(traverse_obj({0: None}, (0, ...)), [], - msg='if branched but state is `None` return `[]`, not `default`') - - branching_paths = [ - ('fail', ...), - (..., 'fail'), - 100 * ('fail',) + (...,), - (...,) + 100 * ('fail',), - ] - for branching_path in branching_paths: - self.assertEqual(traverse_obj({}, branching_path), [], - msg='if branched but state is `None`, return `[]` (not `default`)') - 
self.assertEqual(traverse_obj({}, 'fail', branching_path), [], - msg='if branching in last alternative and previous did not match, return `[]` (not `default`)') - self.assertEqual(traverse_obj({0: 'x'}, 0, branching_path), 'x', - msg='if branching in last alternative and previous did match, return single value') - self.assertEqual(traverse_obj({0: 'x'}, branching_path, 0), 'x', - msg='if branching in first alternative and non-branching path does match, return single value') - self.assertEqual(traverse_obj({}, branching_path, 'fail'), None, - msg='if branching in first alternative and non-branching path does not match, return `default`') - - # Testing expected_type behavior - _EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0} - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str), - 'str', msg='accept matching `expected_type` type') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int), - None, msg='reject non matching `expected_type` type') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)), - '0', msg='transform type using type function') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=lambda _: 1 / 0), - None, msg='wrap expected_type fuction in try_call') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, ..., expected_type=str), - ['str'], msg='eliminate items that expected_type fails on') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}, expected_type=int), - {0: 100}, msg='type as expected_type should filter dict values') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2, 2: 'None'}, expected_type=str_or_none), - {0: '100', 1: '1.2'}, msg='function as expected_type should transform dict values') - self.assertEqual(traverse_obj(_TEST_DATA, ({0: 1.2}, 0, {int_or_none}), expected_type=int), - 1, msg='expected_type should not filter non final dict values') - self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 100, 1: 'str'}}, 
expected_type=int), - {0: {0: 100}}, msg='expected_type should transform deep dict values') - self.assertEqual(traverse_obj(_TEST_DATA, [({0: '...'}, {0: '...'})], expected_type=type(...)), - [{0: ...}, {0: ...}], msg='expected_type should transform branched dict values') - self.assertEqual(traverse_obj({1: {3: 4}}, [(1, 2), 3], expected_type=int), - [4], msg='expected_type regression for type matching in tuple branching') - self.assertEqual(traverse_obj(_TEST_DATA, ['data', ...], expected_type=int), - [], msg='expected_type regression for type matching in dict result') - - # Test get_all behavior - _GET_ALL_DATA = {'key': [0, 1, 2]} - self.assertEqual(traverse_obj(_GET_ALL_DATA, ('key', ...), get_all=False), 0, - msg='if not `get_all`, return only first matching value') - self.assertEqual(traverse_obj(_GET_ALL_DATA, ..., get_all=False), [0, 1, 2], - msg='do not overflatten if not `get_all`') - - # Test casesense behavior - _CASESENSE_DATA = { - 'KeY': 'value0', - 0: { - 'KeY': 'value1', - 0: {'KeY': 'value2'}, - }, - } - self.assertEqual(traverse_obj(_CASESENSE_DATA, 'key'), None, - msg='dict keys should be case sensitive unless `casesense`') - self.assertEqual(traverse_obj(_CASESENSE_DATA, 'keY', - casesense=False), 'value0', - msg='allow non matching key case if `casesense`') - self.assertEqual(traverse_obj(_CASESENSE_DATA, (0, ('keY',)), - casesense=False), ['value1'], - msg='allow non matching key case in branch if `casesense`') - self.assertEqual(traverse_obj(_CASESENSE_DATA, (0, ((0, 'keY'),)), - casesense=False), ['value2'], - msg='allow non matching key case in branch path if `casesense`') - - # Test traverse_string behavior - _TRAVERSE_STRING_DATA = {'str': 'str', 1.2: 1.2} - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0)), None, - msg='do not traverse into string if not `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0), - traverse_string=True), 's', - msg='traverse into string if `traverse_string`') - 
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, (1.2, 1), - traverse_string=True), '.', - msg='traverse into converted data if `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', ...), - traverse_string=True), 'str', - msg='`...` should result in string (same value) if `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', slice(0, None, 2)), - traverse_string=True), 'sr', - msg='`slice` should result in string if `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == "s"), - traverse_string=True), 'str', - msg='function should result in string if `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)), - traverse_string=True), ['s', 'r'], - msg='branching should result in list if `traverse_string`') - self.assertEqual(traverse_obj({}, (0, ...), traverse_string=True), [], - msg='branching should result in list if `traverse_string`') - self.assertEqual(traverse_obj({}, (0, lambda x, y: True), traverse_string=True), [], - msg='branching should result in list if `traverse_string`') - self.assertEqual(traverse_obj({}, (0, slice(1)), traverse_string=True), [], - msg='branching should result in list if `traverse_string`') - - # Test re.Match as input obj - mobj = re.fullmatch(r'0(12)(?P3)(4)?', '0123') - self.assertEqual(traverse_obj(mobj, ...), [x for x in mobj.groups() if x is not None], - msg='`...` on a `re.Match` should give its `groups()`') - self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 2)), ['0123', '3'], - msg='function on a `re.Match` should give groupno, value starting at 0') - self.assertEqual(traverse_obj(mobj, 'group'), '3', - msg='str key on a `re.Match` should give group with that name') - self.assertEqual(traverse_obj(mobj, 2), '3', - msg='int key on a `re.Match` should give group with that name') - self.assertEqual(traverse_obj(mobj, 'gRoUp', casesense=False), '3', - msg='str key on a 
`re.Match` should respect casesense') - self.assertEqual(traverse_obj(mobj, 'fail'), None, - msg='failing str key on a `re.Match` should return `default`') - self.assertEqual(traverse_obj(mobj, 'gRoUpS', casesense=False), None, - msg='failing str key on a `re.Match` should return `default`') - self.assertEqual(traverse_obj(mobj, 8), None, - msg='failing int key on a `re.Match` should return `default`') - self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 'group')), ['0123', '3'], - msg='function on a `re.Match` should give group name as well') - - # Test xml.etree.ElementTree.Element as input obj - etree = xml.etree.ElementTree.fromstring(''' - - - 1 - 2008 - 141100 - - - - - 4 - 2011 - 59900 - - - - 68 - 2011 - 13600 - - - - ''') - self.assertEqual(traverse_obj(etree, ''), etree, - msg='empty str key should return the element itself') - self.assertEqual(traverse_obj(etree, 'country'), list(etree), - msg='str key should lead all children with that tag name') - self.assertEqual(traverse_obj(etree, ...), list(etree), - msg='`...` as key should return all children') - self.assertEqual(traverse_obj(etree, lambda _, x: x[0].text == '4'), [etree[1]], - msg='function as key should get element as value') - self.assertEqual(traverse_obj(etree, lambda i, _: i == 1), [etree[1]], - msg='function as key should get index as key') - self.assertEqual(traverse_obj(etree, 0), etree[0], - msg='int key should return the nth child') - self.assertEqual(traverse_obj(etree, './/neighbor/@name'), - ['Austria', 'Switzerland', 'Malaysia', 'Costa Rica', 'Colombia'], - msg='`@` at end of path should give that attribute') - self.assertEqual(traverse_obj(etree, '//neighbor/@fail'), [None, None, None, None, None], - msg='`@` at end of path should give `None`') - self.assertEqual(traverse_obj(etree, ('//neighbor/@', 2)), {'name': 'Malaysia', 'direction': 'N'}, - msg='`@` should give the full attribute dict') - self.assertEqual(traverse_obj(etree, '//year/text()'), ['2008', '2011', '2011'], 
- msg='`text()` at end of path should give the inner text') - self.assertEqual(traverse_obj(etree, '//*[@direction]/@direction'), ['E', 'W', 'N', 'W', 'E'], - msg='full Python xpath features should be supported') - self.assertEqual(traverse_obj(etree, (0, '@name')), 'Liechtenstein', - msg='special transformations should act on current element') - self.assertEqual(traverse_obj(etree, ('country', 0, ..., 'text()', {int_or_none})), [1, 2008, 141100], - msg='special transformations should act on current element') - def test_http_header_dict(self): headers = HTTPHeaderDict() headers['ytdl-test'] = b'0' diff --git a/test/test_websockets.py b/test/test_websockets.py index 13b3a1e76fb8..b294b0932b90 100644 --- a/test/test_websockets.py +++ b/test/test_websockets.py @@ -32,8 +32,6 @@ ) from yt_dlp.utils.networking import HTTPHeaderDict -from test.conftest import validate_and_send - TEST_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -66,7 +64,9 @@ def process_request(self, request): def create_websocket_server(**ws_kwargs): import websockets.sync.server - wsd = websockets.sync.server.serve(websocket_handler, '127.0.0.1', 0, process_request=process_request, **ws_kwargs) + wsd = websockets.sync.server.serve( + websocket_handler, '127.0.0.1', 0, + process_request=process_request, open_timeout=2, **ws_kwargs) ws_port = wsd.socket.getsockname()[1] ws_server_thread = threading.Thread(target=wsd.serve_forever) ws_server_thread.daemon = True @@ -100,6 +100,19 @@ def create_mtls_wss_websocket_server(): return create_websocket_server(ssl_context=sslctx) +def ws_validate_and_send(rh, req): + rh.validate(req) + max_tries = 3 + for i in range(max_tries): + try: + return rh.send(req) + except TransportError as e: + if i < (max_tries - 1) and 'connection closed during handshake' in str(e): + # websockets server sometimes hangs on new connections + continue + raise + + @pytest.mark.skipif(not websockets, reason='websockets must be installed to test websocket request handlers') class 
TestWebsSocketRequestHandlerConformance: @classmethod @@ -119,7 +132,7 @@ def setup_class(cls): @pytest.mark.parametrize('handler', ['Websockets'], indirect=True) def test_basic_websockets(self, handler): with handler() as rh: - ws = validate_and_send(rh, Request(self.ws_base_url)) + ws = ws_validate_and_send(rh, Request(self.ws_base_url)) assert 'upgrade' in ws.headers assert ws.status == 101 ws.send('foo') @@ -131,7 +144,7 @@ def test_basic_websockets(self, handler): @pytest.mark.parametrize('handler', ['Websockets'], indirect=True) def test_send_types(self, handler, msg, opcode): with handler() as rh: - ws = validate_and_send(rh, Request(self.ws_base_url)) + ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send(msg) assert int(ws.recv()) == opcode ws.close() @@ -140,10 +153,10 @@ def test_send_types(self, handler, msg, opcode): def test_verify_cert(self, handler): with handler() as rh: with pytest.raises(CertificateVerifyError): - validate_and_send(rh, Request(self.wss_base_url)) + ws_validate_and_send(rh, Request(self.wss_base_url)) with handler(verify=False) as rh: - ws = validate_and_send(rh, Request(self.wss_base_url)) + ws = ws_validate_and_send(rh, Request(self.wss_base_url)) assert ws.status == 101 ws.close() @@ -151,7 +164,7 @@ def test_verify_cert(self, handler): def test_ssl_error(self, handler): with handler(verify=False) as rh: with pytest.raises(SSLError, match=r'ssl(?:v3|/tls) alert handshake failure') as exc_info: - validate_and_send(rh, Request(self.bad_wss_host)) + ws_validate_and_send(rh, Request(self.bad_wss_host)) assert not issubclass(exc_info.type, CertificateVerifyError) @pytest.mark.parametrize('handler', ['Websockets'], indirect=True) @@ -163,7 +176,7 @@ def test_ssl_error(self, handler): ]) def test_percent_encode(self, handler, path, expected): with handler() as rh: - ws = validate_and_send(rh, Request(f'{self.ws_base_url}{path}')) + ws = ws_validate_and_send(rh, Request(f'{self.ws_base_url}{path}')) ws.send('path') assert 
ws.recv() == expected assert ws.status == 101 @@ -174,7 +187,7 @@ def test_remove_dot_segments(self, handler): with handler() as rh: # This isn't a comprehensive test, # but it should be enough to check whether the handler is removing dot segments - ws = validate_and_send(rh, Request(f'{self.ws_base_url}/a/b/./../../test')) + ws = ws_validate_and_send(rh, Request(f'{self.ws_base_url}/a/b/./../../test')) assert ws.status == 101 ws.send('path') assert ws.recv() == '/test' @@ -187,7 +200,7 @@ def test_remove_dot_segments(self, handler): def test_raise_http_error(self, handler, status): with handler() as rh: with pytest.raises(HTTPError) as exc_info: - validate_and_send(rh, Request(f'{self.ws_base_url}/gen_{status}')) + ws_validate_and_send(rh, Request(f'{self.ws_base_url}/gen_{status}')) assert exc_info.value.status == status @pytest.mark.parametrize('handler', ['Websockets'], indirect=True) @@ -198,7 +211,7 @@ def test_raise_http_error(self, handler, status): def test_timeout(self, handler, params, extensions): with handler(**params) as rh: with pytest.raises(TransportError): - validate_and_send(rh, Request(self.ws_base_url, extensions=extensions)) + ws_validate_and_send(rh, Request(self.ws_base_url, extensions=extensions)) @pytest.mark.parametrize('handler', ['Websockets'], indirect=True) def test_cookies(self, handler): @@ -210,18 +223,18 @@ def test_cookies(self, handler): comment_url=None, rest={})) with handler(cookiejar=cookiejar) as rh: - ws = validate_and_send(rh, Request(self.ws_base_url)) + ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' ws.close() with handler() as rh: - ws = validate_and_send(rh, Request(self.ws_base_url)) + ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') assert 'cookie' not in json.loads(ws.recv()) ws.close() - ws = validate_and_send(rh, Request(self.ws_base_url, extensions={'cookiejar': cookiejar})) + ws = 
ws_validate_and_send(rh, Request(self.ws_base_url, extensions={'cookiejar': cookiejar})) ws.send('headers') assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' ws.close() @@ -231,7 +244,7 @@ def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' verify_address_availability(source_address) with handler(source_address=source_address) as rh: - ws = validate_and_send(rh, Request(self.ws_base_url)) + ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('source_address') assert source_address == ws.recv() ws.close() @@ -240,7 +253,7 @@ def test_source_address(self, handler): def test_response_url(self, handler): with handler() as rh: url = f'{self.ws_base_url}/something' - ws = validate_and_send(rh, Request(url)) + ws = ws_validate_and_send(rh, Request(url)) assert ws.url == url ws.close() @@ -248,14 +261,14 @@ def test_response_url(self, handler): def test_request_headers(self, handler): with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: # Global Headers - ws = validate_and_send(rh, Request(self.ws_base_url)) + ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') headers = HTTPHeaderDict(json.loads(ws.recv())) assert headers['test1'] == 'test' ws.close() # Per request headers, merged with global - ws = validate_and_send(rh, Request( + ws = ws_validate_and_send(rh, Request( self.ws_base_url, headers={'test2': 'changed', 'test3': 'test3'})) ws.send('headers') headers = HTTPHeaderDict(json.loads(ws.recv())) @@ -288,7 +301,7 @@ def test_mtls(self, handler, client_cert): verify=False, client_cert=client_cert ) as rh: - validate_and_send(rh, Request(self.mtls_wss_base_url)).close() + ws_validate_and_send(rh, Request(self.mtls_wss_base_url)).close() def create_fake_ws_connection(raised): diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index c34d97bba1b0..291fc8d00cdc 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1,7 +1,7 @@ import collections import 
contextlib import copy -import datetime +import datetime as dt import errno import fileinput import http.cookiejar @@ -42,6 +42,7 @@ SSLError, network_exceptions, ) +from .networking.impersonate import ImpersonateRequestHandler from .plugins import directories as plugin_directories from .postprocessor import _PLUGIN_CLASSES as plugin_pps from .postprocessor import ( @@ -99,6 +100,7 @@ SameFileError, UnavailableVideoError, UserNotLive, + YoutubeDLError, age_restricted, args_to_str, bug_reports_message, @@ -144,6 +146,7 @@ subtitles_filename, supports_terminal_sequences, system_identifier, + filesize_from_tbr, timetuple_from_msec, to_high_limit_path, traverse_obj, @@ -402,6 +405,8 @@ class YoutubeDL: - "detect_or_warn": check whether we can do anything about it, warn otherwise (default) source_address: Client-side IP address to bind to. + impersonate: Client to impersonate for requests. + An ImpersonateTarget (from yt_dlp.networking.impersonate) sleep_interval_requests: Number of seconds to sleep between requests during extraction sleep_interval: Number of seconds to sleep before each download when @@ -713,6 +718,13 @@ def check_deprecated(param, option, suggestion): for msg in self.params.get('_deprecation_warnings', []): self.deprecated_feature(msg) + if impersonate_target := self.params.get('impersonate'): + if not self._impersonate_target_available(impersonate_target): + raise YoutubeDLError( + f'Impersonate target "{impersonate_target}" is not available. ' + f'Use --list-impersonate-targets to see available targets. ' + f'You may be missing dependencies required to support this target.') + if 'list-formats' in self.params['compat_opts']: self.params['listformats_table'] = False @@ -2617,7 +2629,7 @@ def _fill_common_fields(self, info_dict, final=True): # Working around out-of-range timestamp values (e.g. 
negative ones on Windows, # see http://bugs.python.org/issue1646728) with contextlib.suppress(ValueError, OverflowError, OSError): - upload_date = datetime.datetime.fromtimestamp(info_dict[ts_key], datetime.timezone.utc) + upload_date = dt.datetime.fromtimestamp(info_dict[ts_key], dt.timezone.utc) info_dict[date_key] = upload_date.strftime('%Y%m%d') if not info_dict.get('release_year'): @@ -2771,7 +2783,7 @@ def sanitize_numeric_fields(info): get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start')) if not get_from_start: - info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + info_dict['title'] += ' ' + dt.datetime.now().strftime('%Y-%m-%d %H:%M') if info_dict.get('is_live') and formats: formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start] if get_from_start and not formats: @@ -2802,6 +2814,9 @@ def is_wellformed(f): format['url'] = sanitize_url(format['url']) if format.get('ext') is None: format['ext'] = determine_ext(format['url']).lower() + if format['ext'] in ('aac', 'opus', 'mp3', 'flac', 'vorbis'): + if format.get('acodec') is None: + format['acodec'] = format['ext'] if format.get('protocol') is None: format['protocol'] = determine_protocol(format) if format.get('resolution') is None: @@ -2812,9 +2827,8 @@ def is_wellformed(f): format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2)) # For fragmented formats, "tbr" is often max bitrate and not average if (('manifest-filesize-approx' in self.params['compat_opts'] or not format.get('manifest_url')) - and info_dict.get('duration') and format.get('tbr') and not format.get('filesize') and not format.get('filesize_approx')): - format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8)) + format['filesize_approx'] = filesize_from_tbr(format.get('tbr'), info_dict.get('duration')) format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict), 
load_cookies=True) # Safeguard against old/insecure infojson when using --load-info-json @@ -3864,8 +3878,8 @@ def simplified_codec(f, field): delim, ( format_field(f, 'filesize', ' \t%s', func=format_bytes) or format_field(f, 'filesize_approx', 'ā‰ˆ\t%s', func=format_bytes) - or format_field(try_call(lambda: format_bytes(int(info_dict['duration'] * f['tbr'] * (1024 / 8)))), - None, self._format_out('~\t%s', self.Styles.SUPPRESS))), + or format_field(filesize_from_tbr(f.get('tbr'), info_dict.get('duration')), None, + self._format_out('~\t%s', self.Styles.SUPPRESS), func=format_bytes)), format_field(f, 'tbr', '\t%dk', func=round), shorten_protocol_name(f.get('protocol', '')), delim, @@ -4077,6 +4091,22 @@ def _opener(self): handler = self._request_director.handlers['Urllib'] return handler._get_instance(cookiejar=self.cookiejar, proxies=self.proxies) + def _get_available_impersonate_targets(self): + # todo(future): make available as public API + return [ + (target, rh.RH_NAME) + for rh in self._request_director.handlers.values() + if isinstance(rh, ImpersonateRequestHandler) + for target in rh.supported_targets + ] + + def _impersonate_target_available(self, target): + # todo(future): make available as public API + return any( + rh.is_supported_target(target) + for rh in self._request_director.handlers.values() + if isinstance(rh, ImpersonateRequestHandler)) + def urlopen(self, req): """ Start an HTTP download """ if isinstance(req, str): @@ -4108,9 +4138,13 @@ def urlopen(self, req): raise RequestError( 'file:// URLs are disabled by default in yt-dlp for security reasons. 
' 'Use --enable-file-urls to enable at your own risk.', cause=ue) from ue - if 'unsupported proxy type: "https"' in ue.msg.lower(): + if ( + 'unsupported proxy type: "https"' in ue.msg.lower() + and 'requests' not in self._request_director.handlers + and 'curl_cffi' not in self._request_director.handlers + ): raise RequestError( - 'To use an HTTPS proxy for this request, one of the following dependencies needs to be installed: requests') + 'To use an HTTPS proxy for this request, one of the following dependencies needs to be installed: requests, curl_cffi') elif ( re.match(r'unsupported url scheme: "wss?"', ue.msg.lower()) @@ -4120,6 +4154,13 @@ def urlopen(self, req): 'This request requires WebSocket support. ' 'Ensure one of the following dependencies are installed: websockets', cause=ue) from ue + + elif re.match(r'unsupported (?:extensions: impersonate|impersonate target)', ue.msg.lower()): + raise RequestError( + f'Impersonate target "{req.extensions["impersonate"]}" is not available.' + f' See --list-impersonate-targets for available targets.' 
+ f' This request requires browser impersonation, however you may be missing dependencies' + f' required to support this target.') raise except SSLError as e: if 'UNSAFE_LEGACY_RENEGOTIATION_DISABLED' in str(e): @@ -4152,6 +4193,7 @@ def build_request_director(self, handlers, preferences=None): 'timeout': 'socket_timeout', 'legacy_ssl_support': 'legacyserverconnect', 'enable_file_urls': 'enable_file_urls', + 'impersonate': 'impersonate', 'client_cert': { 'client_certificate': 'client_certificate', 'client_certificate_key': 'client_certificate_key', diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index aeea2625ef1d..940594fafb8b 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -19,6 +19,7 @@ from .downloader.external import get_external_downloader from .extractor import list_extractor_classes from .extractor.adobepass import MSO_INFO +from .networking.impersonate import ImpersonateTarget from .options import parseOpts from .postprocessor import ( FFmpegExtractAudioPP, @@ -48,6 +49,7 @@ float_or_none, format_field, int_or_none, + join_nonempty, match_filter_func, parse_bytes, parse_duration, @@ -388,6 +390,9 @@ def parse_chapters(name, value, advanced=False): f'Supported keyrings are: {", ".join(sorted(SUPPORTED_KEYRINGS))}') opts.cookiesfrombrowser = (browser_name, profile, keyring, container) + if opts.impersonate is not None: + opts.impersonate = ImpersonateTarget.from_str(opts.impersonate.lower()) + # MetadataParser def metadataparser_actions(f): if isinstance(f, str): @@ -911,6 +916,7 @@ def parse_options(argv=None): 'postprocessors': postprocessors, 'fixup': opts.fixup, 'source_address': opts.source_address, + 'impersonate': opts.impersonate, 'call_home': opts.call_home, 'sleep_interval_requests': opts.sleep_interval_requests, 'sleep_interval': opts.sleep_interval, @@ -980,6 +986,41 @@ def _real_main(argv=None): traceback.print_exc() ydl._download_retcode = 100 + if opts.list_impersonate_targets: + + known_targets = [ + # List of simplified 
targets we know are supported, + # to help users know what dependencies may be required. + (ImpersonateTarget('chrome'), 'curl_cffi'), + (ImpersonateTarget('edge'), 'curl_cffi'), + (ImpersonateTarget('safari'), 'curl_cffi'), + ] + + available_targets = ydl._get_available_impersonate_targets() + + def make_row(target, handler): + return [ + join_nonempty(target.client.title(), target.version, delim='-') or '-', + join_nonempty((target.os or "").title(), target.os_version, delim='-') or '-', + handler, + ] + + rows = [make_row(target, handler) for target, handler in available_targets] + + for known_target, known_handler in known_targets: + if not any( + known_target in target and handler == known_handler + for target, handler in available_targets + ): + rows.append([ + ydl._format_out(text, ydl.Styles.SUPPRESS) + for text in make_row(known_target, f'{known_handler} (not available)') + ]) + + ydl.to_screen('[info] Available impersonate targets') + ydl.to_stdout(render_table(['Client', 'OS', 'Source'], rows, extra_gap=2, delim='-')) + return + if not actual_use: if pre_process: return ydl._download_retcode diff --git a/yt_dlp/__pyinstaller/hook-yt_dlp.py b/yt_dlp/__pyinstaller/hook-yt_dlp.py index 7c3dbfb66bea..8e7f42f5969d 100644 --- a/yt_dlp/__pyinstaller/hook-yt_dlp.py +++ b/yt_dlp/__pyinstaller/hook-yt_dlp.py @@ -1,6 +1,6 @@ import sys -from PyInstaller.utils.hooks import collect_submodules +from PyInstaller.utils.hooks import collect_submodules, collect_data_files def pycryptodome_module(): @@ -25,10 +25,12 @@ def get_hidden_imports(): for module in ('websockets', 'requests', 'urllib3'): yield from collect_submodules(module) # These are auto-detected, but explicitly add them just in case - yield from ('mutagen', 'brotli', 'certifi', 'secretstorage') + yield from ('mutagen', 'brotli', 'certifi', 'secretstorage', 'curl_cffi') hiddenimports = list(get_hidden_imports()) print(f'Adding imports: {hiddenimports}') excludedimports = ['youtube_dl', 'youtube_dlc', 'test', 
'ytdlp_plugins', 'devscripts', 'bundle'] + +datas = collect_data_files('curl_cffi', includes=['cacert.pem']) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 28d174a09f12..85d6dd18232e 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1,6 +1,7 @@ import base64 import collections import contextlib +import datetime as dt import glob import http.cookiejar import http.cookies @@ -15,7 +16,6 @@ import tempfile import time import urllib.request -from datetime import datetime, timedelta, timezone from enum import Enum, auto from hashlib import pbkdf2_hmac @@ -594,7 +594,7 @@ def skip_to_end(self, description='unknown'): def _mac_absolute_time_to_posix(timestamp): - return int((datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc) + timedelta(seconds=timestamp)).timestamp()) + return int((dt.datetime(2001, 1, 1, 0, 0, tzinfo=dt.timezone.utc) + dt.timedelta(seconds=timestamp)).timestamp()) def _parse_safari_cookies_header(data, logger): diff --git a/yt_dlp/dependencies/__init__.py b/yt_dlp/dependencies/__init__.py index 9e3f90724e38..0d58da2bd5bf 100644 --- a/yt_dlp/dependencies/__init__.py +++ b/yt_dlp/dependencies/__init__.py @@ -74,6 +74,10 @@ if hasattr(xattr, 'set'): # pyxattr xattr._yt_dlp__identifier = 'pyxattr' +try: + import curl_cffi +except ImportError: + curl_cffi = None from . import Cryptodome diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index ce5eeb0a9a27..8b0b94e72560 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -491,7 +491,7 @@ def _call_downloader(self, tmpfilename, info_dict): if not self.params.get('verbose'): args += ['-hide_banner'] - args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args'), default=[]) + args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args', ...)) # These exists only for compatibility. 
Extractors should use # info_dict['downloader_options']['ffmpeg_args'] instead @@ -615,6 +615,8 @@ def _call_downloader(self, tmpfilename, info_dict): else: args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)] + args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args_out', ...)) + args += self._configuration_args(('_o1', '_o', '')) args = [encodeArgument(opt) for opt in args] diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c75365536804..2ad5801c442a 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -150,6 +150,7 @@ ) from .arnes import ArnesIE from .asobichannel import AsobiChannelIE, AsobiChannelTagURLIE +from .asobistage import AsobiStageIE from .atresplayer import AtresPlayerIE from .atscaleconf import AtScaleConfEventIE from .atvat import ATVAtIE @@ -590,6 +591,7 @@ FacebookReelIE, FacebookAdsIE, ) +from .fathom import FathomIE from .fancode import ( FancodeVodIE, FancodeLiveIE @@ -989,6 +991,10 @@ LnkGoIE, LnkIE, ) +from .loom import ( + LoomIE, + LoomFolderIE, +) from .lovehomeporn import LoveHomePornIE from .lrt import ( LRTVODIE, @@ -1750,6 +1756,7 @@ ShahidIE, ShahidShowIE, ) +from .sharepoint import SharePointIE from .sharevideos import ShareVideosEmbedIE from .sibnet import SibnetEmbedIE from .shemaroome import ShemarooMeIE @@ -2283,6 +2290,7 @@ VrtNUIE, KetnetIE, DagelijkseKostIE, + Radio1BeIE, ) from .vtm import VTMIE from .medialaan import MedialaanIE diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 46e68d61e260..3db59c5ca973 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -1,5 +1,5 @@ +import functools import re -from functools import partial from .common import InfoExtractor from ..utils import ( @@ -349,7 +349,7 @@ def _extract_episode_info(self, title): r'(?P.*)', ] - return traverse_obj(patterns, (..., {partial(re.match, string=title)}, { + return traverse_obj(patterns, (..., {functools.partial(re.match, string=title)}, { 
'season_number': ('season_number', {int_or_none}), 'episode_number': ('episode_number', {int_or_none}), 'episode': (( diff --git a/yt_dlp/extractor/asobistage.py b/yt_dlp/extractor/asobistage.py new file mode 100644 index 000000000000..b088a1b1321a --- /dev/null +++ b/yt_dlp/extractor/asobistage.py @@ -0,0 +1,154 @@ +import functools + +from .common import InfoExtractor +from ..utils import str_or_none, url_or_none +from ..utils.traversal import traverse_obj + + +class AsobiStageIE(InfoExtractor): + IE_DESC = 'ASOBISTAGE (ć‚¢ć‚½ćƒ“ć‚¹ćƒ†ćƒ¼ć‚ø)' + _VALID_URL = r'https?://asobistage\.asobistore\.jp/event/(?P(?P\w+)/(?Parchive|player)/(?P\w+))(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://asobistage.asobistore.jp/event/315passionhour_2022summer/archive/frame', + 'info_dict': { + 'id': '315passionhour_2022summer/archive/frame', + 'title': '315惗惭惀ć‚Æć‚·ćƒ§ćƒ³ćƒ—ćƒ¬ć‚¼ćƒ³ćƒ„ 315ćƒ‘ćƒƒć‚·ćƒ§ćƒ³ć‚¢ćƒÆćƒ¼!!!', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': 'edff52f2', + 'ext': 'mp4', + 'title': '315passion_FRAME_only', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + }], + }, { + 'url': 'https://asobistage.asobistore.jp/event/idolmaster_idolworld2023_goods/archive/live', + 'info_dict': { + 'id': 'idolmaster_idolworld2023_goods/archive/live', + 'title': 'md5:378510b6e830129d505885908bd6c576', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '3aef7110', + 'ext': 'mp4', + 'title': 'asobistore_station_1020_serverREC', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + }], + }, { + 'url': 'https://asobistage.asobistore.jp/event/sidem_fclive_bpct/archive/premium_hc', + 'playlist_count': 4, + 'info_dict': { + 'id': 'sidem_fclive_bpct/archive/premium_hc', + 'title': '315 Production presents Fļ¼ NTASTIC COMBINATION LIVE ļ½žBRAINPOWER!!ļ½ž/ļ½žCONNECTIME!!!!ļ½ž', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + }, { + 'url': 
'https://asobistage.asobistore.jp/event/ijigenfes_utagassen/player/day1', + 'only_matching': True, + }] + + _API_HOST = 'https://asobistage-api.asobistore.jp' + _HEADERS = {} + _is_logged_in = False + + @functools.cached_property + def _owned_tickets(self): + owned_tickets = set() + if not self._is_logged_in: + return owned_tickets + + for path, name in [ + ('api/v1/purchase_history/list', 'ticket purchase history'), + ('api/v1/serialcode/list', 'redemption history'), + ]: + response = self._download_json( + f'{self._API_HOST}/{path}', None, f'Downloading {name}', + f'Unable to download {name}', expected_status=400) + if traverse_obj(response, ('payload', 'error_message'), 'error') == 'notlogin': + self._is_logged_in = False + break + owned_tickets.update( + traverse_obj(response, ('payload', 'value', ..., 'digital_product_id', {str_or_none}))) + + return owned_tickets + + def _get_available_channel_id(self, channel): + channel_id = traverse_obj(channel, ('chennel_vspf_id', {str})) + if not channel_id: + return None + # if rights_type_id == 6, then 'No conditions (no login required - non-members are OK)' + if traverse_obj(channel, ('viewrights', lambda _, v: v['rights_type_id'] == 6)): + return channel_id + available_tickets = traverse_obj(channel, ( + 'viewrights', ..., ('tickets', 'serialcodes'), ..., 'digital_product_id', {str_or_none})) + if not self._owned_tickets.intersection(available_tickets): + self.report_warning( + f'You are not a ticketholder for "{channel.get("channel_name") or channel_id}"') + return None + return channel_id + + def _real_initialize(self): + if self._get_cookies(self._API_HOST): + self._is_logged_in = True + token = self._download_json( + f'{self._API_HOST}/api/v1/vspf/token', None, 'Getting token', 'Unable to get token') + self._HEADERS['Authorization'] = f'Bearer {token}' + + def _real_extract(self, url): + video_id, event, type_, slug = self._match_valid_url(url).group('id', 'event', 'type', 'slug') + video_type = {'archive': 
'archives', 'player': 'broadcasts'}[type_] + webpage = self._download_webpage(url, video_id) + event_data = traverse_obj( + self._search_nextjs_data(webpage, video_id, default='{}'), + ('props', 'pageProps', 'eventCMSData', { + 'title': ('event_name', {str}), + 'thumbnail': ('event_thumbnail_image', {url_or_none}), + })) + + available_channels = traverse_obj(self._download_json( + f'https://asobistage.asobistore.jp/cdn/v101/events/{event}/{video_type}.json', + video_id, 'Getting channel list', 'Unable to get channel list'), ( + video_type, lambda _, v: v['broadcast_slug'] == slug, + 'channels', lambda _, v: v['chennel_vspf_id'] != '00000')) + + entries = [] + for channel_id in traverse_obj(available_channels, (..., {self._get_available_channel_id})): + if video_type == 'archives': + channel_json = self._download_json( + f'https://survapi.channel.or.jp/proxy/v1/contents/{channel_id}/get_by_cuid', channel_id, + 'Getting archive channel info', 'Unable to get archive channel info', fatal=False, + headers=self._HEADERS) + channel_data = traverse_obj(channel_json, ('ex_content', { + 'm3u8_url': 'streaming_url', + 'title': 'title', + 'thumbnail': ('thumbnail', 'url'), + })) + else: # video_type == 'broadcasts' + channel_json = self._download_json( + f'https://survapi.channel.or.jp/ex/events/{channel_id}', channel_id, + 'Getting live channel info', 'Unable to get live channel info', fatal=False, + headers=self._HEADERS, query={'embed': 'channel'}) + channel_data = traverse_obj(channel_json, ('data', { + 'm3u8_url': ('Channel', 'Custom_live_url'), + 'title': 'Name', + 'thumbnail': 'Poster_url', + })) + + entries.append({ + 'id': channel_id, + 'title': channel_data.get('title'), + 'formats': self._extract_m3u8_formats(channel_data.get('m3u8_url'), channel_id, fatal=False), + 'is_live': video_type == 'broadcasts', + 'thumbnail': url_or_none(channel_data.get('thumbnail')), + }) + + if not self._is_logged_in and not entries: + self.raise_login_required() + + return 
self.playlist_result(entries, video_id, **event_data) diff --git a/yt_dlp/extractor/atvat.py b/yt_dlp/extractor/atvat.py index d6ed9e49586f..d60feba3159a 100644 --- a/yt_dlp/extractor/atvat.py +++ b/yt_dlp/extractor/atvat.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt from .common import InfoExtractor from ..utils import ( @@ -71,9 +71,9 @@ def _real_extract(self, url): content_ids = [{'id': id, 'subclip_start': content['start'], 'subclip_end': content['end']} for id, content in enumerate(contentResource)] - time_of_request = datetime.datetime.now() - not_before = time_of_request - datetime.timedelta(minutes=5) - expire = time_of_request + datetime.timedelta(minutes=5) + time_of_request = dt.datetime.now() + not_before = time_of_request - dt.timedelta(minutes=5) + expire = time_of_request + dt.timedelta(minutes=5) payload = { 'content_ids': { content_id: content_ids, diff --git a/yt_dlp/extractor/aws.py b/yt_dlp/extractor/aws.py index c4741a6a1197..4ebef9295739 100644 --- a/yt_dlp/extractor/aws.py +++ b/yt_dlp/extractor/aws.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt import hashlib import hmac @@ -12,7 +12,7 @@ class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with def _aws_execute_api(self, aws_dict, video_id, query=None): query = query or {} - amz_date = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ') + amz_date = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%dT%H%M%SZ') date = amz_date[:8] headers = { 'Accept': 'application/json', diff --git a/yt_dlp/extractor/bibeltv.py b/yt_dlp/extractor/bibeltv.py index 34464daa1aa9..666b51c56a7d 100644 --- a/yt_dlp/extractor/bibeltv.py +++ b/yt_dlp/extractor/bibeltv.py @@ -1,4 +1,4 @@ -from functools import partial +import functools from .common import InfoExtractor from ..utils import ( @@ -50,7 +50,7 @@ def _extract_base_info(data): **traverse_obj(data, { 'title': 'title', 'description': 'description', - 'duration': ('duration', 
{partial(int_or_none, scale=1000)}), + 'duration': ('duration', {functools.partial(int_or_none, scale=1000)}), 'timestamp': ('schedulingStart', {parse_iso8601}), 'season_number': 'seasonNumber', 'episode_number': 'episodeNumber', diff --git a/yt_dlp/extractor/box.py b/yt_dlp/extractor/box.py index 7281b3c6a6af..008c011cc8aa 100644 --- a/yt_dlp/extractor/box.py +++ b/yt_dlp/extractor/box.py @@ -3,6 +3,7 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, parse_iso8601, update_url_query, url_or_none, @@ -11,8 +12,8 @@ class BoxIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P[^/?#]+)/file/(?P\d+)' - _TEST = { + _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P[^/?#]+)(?:/file/(?P\d+))?' + _TESTS = [{ 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', 'info_dict': { @@ -25,14 +26,36 @@ class BoxIE(InfoExtractor): 'uploader_id': '235196876', }, 'params': {'skip_download': 'dash fragment too small'}, - } + }, { + 'url': 'https://utexas.app.box.com/s/2x6vanv85fdl8j2eqlcxmv0gp1wvps6e', + 'info_dict': { + 'id': '787379022466', + 'ext': 'mp4', + 'title': 'Webinar recording: Take the Leap!.mp4', + 'uploader': 'Patricia Mosele', + 'timestamp': 1615824864, + 'upload_date': '20210315', + 'uploader_id': '239068974', + }, + 'params': {'skip_download': 'dash fragment too small'}, + }] def _real_extract(self, url): shared_name, file_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, file_id) - request_token = self._parse_json(self._search_regex( - r'Box\.config\s*=\s*({.+?});', webpage, - 'Box config'), file_id)['requestToken'] + webpage = self._download_webpage(url, file_id or shared_name) + + if not file_id: + post_stream_data = self._search_json( + r'Box\.postStreamData\s*=', webpage, 'Box post-stream data', shared_name) + shared_item = traverse_obj( + post_stream_data, ('/app-api/enduserapp/shared-item', 
{dict})) or {} + if shared_item.get('itemType') != 'file': + raise ExtractorError('The requested resource is not a file', expected=True) + + file_id = str(shared_item['itemID']) + + request_token = self._search_json( + r'Box\.config\s*=', webpage, 'Box config', file_id)['requestToken'] access_token = self._download_json( 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, 'Downloading token JSON metadata', diff --git a/yt_dlp/extractor/bundestag.py b/yt_dlp/extractor/bundestag.py index 9fd7c7de185b..71f7726659c5 100644 --- a/yt_dlp/extractor/bundestag.py +++ b/yt_dlp/extractor/bundestag.py @@ -1,5 +1,5 @@ +import functools import re -from functools import partial from .common import InfoExtractor from ..networking.exceptions import HTTPError @@ -115,9 +115,9 @@ def _real_extract(self, url): note='Downloading metadata overlay', fatal=False, ), { 'title': ( - {partial(get_element_text_and_html_by_tag, 'h3')}, 0, - {partial(re.sub, r']*>[^<]+', '')}, {clean_html}), - 'description': ({partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}), + {functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0, + {functools.partial(re.sub, r']*>[^<]+', '')}, {clean_html}), + 'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}), })) return result diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index b5beb1ec8cdc..ff320dd68388 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -151,7 +151,7 @@ def _real_extract(self, url): class CBCPlayerIE(InfoExtractor): IE_NAME = 'cbc.ca:player' - _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P\d+)' + _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P(?:\d\.)?\d+)' _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', 'md5': '64d25f841ddf4ddb28a235338af32e2c', @@ -165,9 +165,52 @@ class CBCPlayerIE(InfoExtractor): 
'uploader': 'CBCC-NEW', }, 'skip': 'Geo-restricted to Canada and no longer available', + }, { + 'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2657631896', + 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', + 'info_dict': { + 'id': '2657631896', + 'ext': 'mp3', + 'title': 'CBC Montreal is organizing its first ever community hackathon!', + 'description': 'md5:dd3b692f0a139b0369943150bd1c46a9', + 'timestamp': 1425704400, + 'upload_date': '20150307', + 'uploader': 'CBCC-NEW', + 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', + 'chapters': [], + 'duration': 494.811, + 'categories': ['AudioMobile/All in a Weekend Montreal'], + 'tags': 'count:8', + 'location': 'Quebec', + 'series': 'All in a Weekend Montreal', + 'season': 'Season 2015', + 'season_number': 2015, + 'media_type': 'Excerpt', + }, + }, { + 'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2164402062', + 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', + 'info_dict': { + 'id': '2164402062', + 'ext': 'mp4', + 'title': 'Cancer survivor four times over', + 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', + 'timestamp': 1320410746, + 'upload_date': '20111104', + 'uploader': 'CBCC-NEW', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', + 'chapters': [], + 'duration': 186.867, + 'series': 'CBC News: Windsor at 6:00', + 'categories': ['News/Canada/Windsor'], + 'location': 'Windsor', + 'tags': ['cancer'], + 'creators': ['Allison Johnson'], + 'media_type': 'Excerpt', + }, }, { # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ - 'url': 'http://www.cbc.ca/player/play/2657631896', + 'url': 'https://www.cbc.ca/player/play/1.2985700', 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', 'info_dict': { 'id': '2657631896', @@ -189,7 +232,7 @@ class CBCPlayerIE(InfoExtractor): 'media_type': 'Excerpt', }, }, { - 'url': 
'http://www.cbc.ca/player/play/2164402062', + 'url': 'https://www.cbc.ca/player/play/1.1711287', 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', 'info_dict': { 'id': '2164402062', @@ -206,38 +249,53 @@ class CBCPlayerIE(InfoExtractor): 'categories': ['News/Canada/Windsor'], 'location': 'Windsor', 'tags': ['cancer'], - 'creator': 'Allison Johnson', + 'creators': ['Allison Johnson'], 'media_type': 'Excerpt', }, }, { # Has subtitles # These broadcasts expire after ~1 month, can find new test URL here: # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast - 'url': 'http://www.cbc.ca/player/play/2284799043667', - 'md5': '9b49f0839e88b6ec0b01d840cf3d42b5', + 'url': 'https://www.cbc.ca/player/play/1.7159484', + 'md5': '6ed6cd0fc2ef568d2297ba68a763d455', 'info_dict': { - 'id': '2284799043667', + 'id': '2324213316001', 'ext': 'mp4', - 'title': 'The National | Hockey coach charged, Green grants, Safer drugs', - 'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa', - 'timestamp': 1700272800, - 'duration': 2718.833, + 'title': 'The National | School boards sue social media giants', + 'description': 'md5:4b4db69322fa32186c3ce426da07402c', + 'timestamp': 1711681200, + 'duration': 2743.400, 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/607/559/thumbnail.jpeg', 'uploader': 'CBCC-NEW', 'chapters': 'count:5', - 'upload_date': '20231118', + 'upload_date': '20240329', 'categories': 'count:4', 'series': 'The National - Full Show', 'tags': 'count:1', - 'creator': 'News', + 'creators': ['News'], 'location': 'Canada', 'media_type': 'Full Program', }, + }, { + 'url': 'cbcplayer:1.7159484', + 'only_matching': True, + }, { + 'url': 'cbcplayer:2164402062', + 'only_matching': True, + }, { + 'url': 'http://www.cbc.ca/player/play/2657631896', + 'only_matching': True, }] def _real_extract(self, url): 
video_id = self._match_id(url) + if '.' in video_id: + webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id) + video_id = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, + 'initial state', video_id)['video']['currentClip']['mediaId'] + return { '_type': 'url_transparent', 'ie_key': 'ThePlatform', diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 1157114b2ad9..90b4d082e2d3 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -1,6 +1,6 @@ import base64 import codecs -import datetime +import datetime as dt import hashlib import hmac import json @@ -134,7 +134,7 @@ def _perform_login(self, username, password): self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})' cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {} - if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5: + if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5: self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}' return @@ -154,7 +154,7 @@ def _perform_login(self, username, password): }) self.cache.store(self._BEARER_CACHE, username, { 'token': token_res['access_token'], - 'valid_until': token_res['expires_in'] + datetime.datetime.now().timestamp(), + 'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(), }) self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}' diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index e776ccae9249..57bbf9bdf1e6 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -37,6 +37,7 @@ IncompleteRead, network_exceptions, ) +from ..networking.impersonate import ImpersonateTarget from ..utils import ( IDENTITY, JSON_LD_RE, @@ -170,12 +171,12 @@ class InfoExtractor: Automatically calculated from width and height * dynamic_range The dynamic range of the video. 
One of: "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV" - * tbr Average bitrate of audio and video in KBit/s - * abr Average audio bitrate in KBit/s + * tbr Average bitrate of audio and video in kbps (1000 bits/sec) + * abr Average audio bitrate in kbps (1000 bits/sec) * acodec Name of the audio codec in use * asr Audio sampling rate in Hertz * audio_channels Number of audio channels - * vbr Average video bitrate in KBit/s + * vbr Average video bitrate in kbps (1000 bits/sec) * fps Frame rate * vcodec Name of the video codec in use * container Name of the container format @@ -246,7 +247,8 @@ class InfoExtractor: * downloader_options A dictionary of downloader options (For internal use only) * http_chunk_size Chunk size for HTTP downloads - * ffmpeg_args Extra arguments for ffmpeg downloader + * ffmpeg_args Extra arguments for ffmpeg downloader (input) + * ffmpeg_args_out Extra arguments for ffmpeg downloader (output) * is_dash_periods Whether the format is a result of merging multiple DASH periods. RTMP formats can also have the additional fields: page_url, @@ -817,7 +819,7 @@ def __can_accept_status_code(err, expected_status): else: return err.status in variadic(expected_status) - def _create_request(self, url_or_request, data=None, headers=None, query=None): + def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None): if isinstance(url_or_request, urllib.request.Request): self._downloader.deprecation_warning( 'Passing a urllib.request.Request to _create_request() is deprecated. 
' @@ -826,10 +828,11 @@ def _create_request(self, url_or_request, data=None, headers=None, query=None): elif not isinstance(url_or_request, Request): url_or_request = Request(url_or_request) - url_or_request.update(data=data, headers=headers, query=query) + url_or_request.update(data=data, headers=headers, query=query, extensions=extensions) return url_or_request - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, + headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False): """ Return the response handle. @@ -860,8 +863,31 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa headers = (headers or {}).copy() headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip) + extensions = {} + + if impersonate in (True, ''): + impersonate = ImpersonateTarget() + requested_targets = [ + t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t) + for t in variadic(impersonate) + ] if impersonate else [] + + available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None) + if available_target: + extensions['impersonate'] = available_target + elif requested_targets: + message = 'The extractor is attempting impersonation, but ' + message += ( + 'no impersonate target is available' if not str(impersonate) + else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"') + info_msg = ('see https://github.com/yt-dlp/yt-dlp#impersonation ' + 'for information on installing the required dependencies') + if require_impersonation: + raise ExtractorError(f'{message}; {info_msg}', expected=True) + self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True) + try: - return 
self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) + return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions)) except network_exceptions as err: if isinstance(err, HTTPError): if self.__can_accept_status_code(err, expected_status): @@ -880,13 +906,14 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa return False def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, - encoding=None, data=None, headers={}, query={}, expected_status=None): + encoding=None, data=None, headers={}, query={}, expected_status=None, + impersonate=None, require_impersonation=False): """ Return a tuple (page content as string, URL handle). Arguments: url_or_request -- plain text URL as a string or - a urllib.request.Request object + a yt_dlp.networking.Request object video_id -- Video/playlist/item identifier (string) Keyword arguments: @@ -911,13 +938,22 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote= returning True if it should be accepted Note that this argument does not affect success status codes (2xx) which are always accepted. + impersonate -- the impersonate target. 
Can be any of the following entities: + - an instance of yt_dlp.networking.impersonate.ImpersonateTarget + - a string in the format of CLIENT[:OS] + - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances + - a boolean value; True means any impersonate target is sufficient + require_impersonation -- flag to toggle whether the request should raise an error + if impersonation is not possible (bool, default: False) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, str): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, + headers=headers, query=query, expected_status=expected_status, + impersonate=impersonate, require_impersonation=require_impersonation) if urlh is False: assert not fatal return False @@ -1046,17 +1082,20 @@ def parse(ie, content, *args, errnote=errnote, **kwargs): return getattr(ie, parser)(content, *args, **kwargs) def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None, + impersonate=None, require_impersonation=False): res = self._download_webpage_handle( url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, expected_status=expected_status) + data=data, headers=headers, query=query, expected_status=expected_status, + impersonate=impersonate, require_impersonation=require_impersonation) if res is False: return res content, urlh = res return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh def download_content(self, url_or_request, 
video_id, note=note, errnote=errnote, transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None, + impersonate=None, require_impersonation=False): if self.get_param('load_pages'): url_or_request = self._create_request(url_or_request, data, headers, query) filename = self._request_dump_filename(url_or_request.url, video_id) @@ -1079,6 +1118,8 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote, 'headers': headers, 'query': query, 'expected_status': expected_status, + 'impersonate': impersonate, + 'require_impersonation': require_impersonation, } if parser is None: kwargs.pop('transform_source') diff --git a/yt_dlp/extractor/dtube.py b/yt_dlp/extractor/dtube.py index bb06c42befa8..5ea014cf05de 100644 --- a/yt_dlp/extractor/dtube.py +++ b/yt_dlp/extractor/dtube.py @@ -1,5 +1,5 @@ import json -from socket import timeout +import socket from .common import InfoExtractor from ..utils import ( @@ -56,7 +56,7 @@ def canonical_url(h): try: self.to_screen('%s: Checking %s video format URL' % (video_id, format_id)) self._downloader._opener.open(video_url, timeout=5).close() - except timeout: + except socket.timeout: self.to_screen( '%s: %s URL is invalid, skipping' % (video_id, format_id)) continue diff --git a/yt_dlp/extractor/fathom.py b/yt_dlp/extractor/fathom.py new file mode 100644 index 000000000000..1df7d96fe894 --- /dev/null +++ b/yt_dlp/extractor/fathom.py @@ -0,0 +1,54 @@ +import json + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + float_or_none, + get_element_html_by_id, + parse_iso8601, +) +from ..utils.traversal import traverse_obj + + +class FathomIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fathom\.video/share/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://fathom.video/share/G9mkjkspnohVVZ_L5nrsoPycyWcB8y7s', + 'md5': '0decd5343b8f30ae268625e79a02b60f', + 
'info_dict': { + 'id': '47200596', + 'ext': 'mp4', + 'title': 'eCom Inucbator - Coaching Session', + 'duration': 8125.380507, + 'timestamp': 1699048914, + 'upload_date': '20231103', + }, + }, { + 'url': 'https://fathom.video/share/mEws3bybftHL2QLymxYEDeE21vtLxGVm', + 'md5': '4f5cb382126c22d1aba8a939f9c49690', + 'info_dict': { + 'id': '46812957', + 'ext': 'mp4', + 'title': 'Jon, Lawrence, Neman chat about practice', + 'duration': 3571.517847, + 'timestamp': 1698933600, + 'upload_date': '20231102', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + props = traverse_obj( + get_element_html_by_id('app', webpage), ({extract_attributes}, 'data-page', {json.loads}, 'props')) + video_id = str(props['call']['id']) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(props['call']['video_url'], video_id, 'mp4'), + **traverse_obj(props, { + 'title': ('head', 'title', {str}), + 'duration': ('duration', {float_or_none}), + 'timestamp': ('call', 'started_at', {parse_iso8601}), + }), + } diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py index 74aad1192750..7a98e0f31c07 100644 --- a/yt_dlp/extractor/goplay.py +++ b/yt_dlp/extractor/goplay.py @@ -1,6 +1,6 @@ import base64 import binascii -import datetime +import datetime as dt import hashlib import hmac import json @@ -422,7 +422,7 @@ def __get_current_timestamp(): months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - time_now = datetime.datetime.now(datetime.timezone.utc) + time_now = dt.datetime.now(dt.timezone.utc) format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day) time_string = time_now.strftime(format_string) return time_string diff --git a/yt_dlp/extractor/imgur.py b/yt_dlp/extractor/imgur.py index 1fa0a2a7918d..f32c116bb12b 100644 --- 
a/yt_dlp/extractor/imgur.py +++ b/yt_dlp/extractor/imgur.py @@ -76,6 +76,23 @@ class ImgurIE(ImgurBaseIE): 'thumbnail': 'https://i.imgur.com/jxBXAMCh.jpg', 'dislike_count': int, }, + }, { + # needs Accept header, ref: https://github.com/yt-dlp/yt-dlp/issues/9458 + 'url': 'https://imgur.com/zV03bd5', + 'md5': '59df97884e8ba76143ff6b640a0e2904', + 'info_dict': { + 'id': 'zV03bd5', + 'ext': 'mp4', + 'title': 'Ive - Liz', + 'timestamp': 1710491255, + 'upload_date': '20240315', + 'like_count': int, + 'dislike_count': int, + 'duration': 56.92, + 'comment_count': int, + 'release_timestamp': 1710491255, + 'release_date': '20240315', + }, }] def _real_extract(self, url): @@ -192,6 +209,7 @@ def og_get_size(media_type): 'id': video_id, 'formats': formats, 'thumbnail': url_or_none(search('thumbnailUrl')), + 'http_headers': {'Accept': '*/*'}, } diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index a59209835998..1131ac0d47aa 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -2,6 +2,7 @@ from ..utils import ( int_or_none, js_to_json, + orderedSet, url_or_none, urlencode_postdata, urljoin, @@ -24,13 +25,14 @@ class JioSaavnSongIE(JioSaavnBaseIE): 'md5': '3b84396d15ed9e083c3106f1fa589c04', 'info_dict': { 'id': 'OQsEfQFVUXk', - 'ext': 'mp4', + 'ext': 'm4a', 'title': 'Leja Re', 'album': 'Leja Re', 'thumbnail': 'https://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg', 'duration': 205, 'view_count': int, 'release_year': 2018, + 'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi', 'Rashmi Virag', 'Irshad Kamil'], }, }, { 'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU', @@ -61,9 +63,10 @@ def _real_extract(self, url): if not media_data.get('auth_url'): self.report_warning(f'Unable to extract format info for {bitrate}') continue + ext = media_data.get('type') formats.append({ 'url': media_data['auth_url'], - 'ext': media_data.get('type'), + 'ext': 'm4a' if ext 
== 'mp4' else ext, 'format_id': bitrate, 'abr': int(bitrate), 'vcodec': 'none', @@ -79,6 +82,7 @@ def _real_extract(self, url): 'duration': ('duration', {int_or_none}), 'view_count': ('play_count', {int_or_none}), 'release_year': ('year', {int_or_none}), + 'artists': ('artists', ..., 'name', {str}, all, {orderedSet}), }), } diff --git a/yt_dlp/extractor/joqrag.py b/yt_dlp/extractor/joqrag.py index 3bb28af94e12..c68ad8cb5f4b 100644 --- a/yt_dlp/extractor/joqrag.py +++ b/yt_dlp/extractor/joqrag.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt import urllib.parse from .common import InfoExtractor @@ -50,8 +50,8 @@ def _extract_metadata(self, variable, html): def _extract_start_timestamp(self, video_id, is_live): def extract_start_time_from(date_str): - dt = datetime_from_str(date_str) + datetime.timedelta(hours=9) - date = dt.strftime('%Y%m%d') + dt_ = datetime_from_str(date_str) + dt.timedelta(hours=9) + date = dt_.strftime('%Y%m%d') start_time = self._search_regex( r']+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+ā€“\s*(\d{1,2}:\d{1,2})', self._download_webpage( @@ -60,7 +60,7 @@ def extract_start_time_from(date_str): errnote=f'Failed to download program list of {date}') or '', 'start time', default=None) if start_time: - return unified_timestamp(f'{dt.strftime("%Y/%m/%d")} {start_time} +09:00') + return unified_timestamp(f'{dt_.strftime("%Y/%m/%d")} {start_time} +09:00') return None start_timestamp = extract_start_time_from('today') @@ -87,7 +87,7 @@ def _real_extract(self, url): msg = 'This stream is not currently live' if release_timestamp: msg += (' and will start at ' - + datetime.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S')) + + dt.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S')) self.raise_no_formats(msg, expected=True) else: m3u8_path = self._search_regex( diff --git a/yt_dlp/extractor/leeco.py b/yt_dlp/extractor/leeco.py index 85033b8f8b48..5d61a607f7cf 100644 --- a/yt_dlp/extractor/leeco.py 
+++ b/yt_dlp/extractor/leeco.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt import hashlib import re import time @@ -185,7 +185,7 @@ def get_flash_urls(media_url, format_id): publish_time = parse_iso8601(self._html_search_regex( r'发åøƒę—¶é—“ ([^<>]+) ', page, 'publish time', default=None), - delimiter=' ', timezone=datetime.timedelta(hours=8)) + delimiter=' ', timezone=dt.timedelta(hours=8)) description = self._html_search_meta('description', page, fatal=False) return { diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index ad41c0e20f4f..e12f467ef51c 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -1,4 +1,4 @@ -from itertools import zip_longest +import itertools import re from .common import InfoExtractor @@ -156,7 +156,7 @@ class LinkedInLearningIE(LinkedInLearningBaseIE): def json2srt(self, transcript_lines, duration=None): srt_data = '' - for line, (line_dict, next_dict) in enumerate(zip_longest(transcript_lines, transcript_lines[1:])): + for line, (line_dict, next_dict) in enumerate(itertools.zip_longest(transcript_lines, transcript_lines[1:])): start_time, caption = line_dict['transcriptStartAt'] / 1000, line_dict['caption'] end_time = next_dict['transcriptStartAt'] / 1000 if next_dict else duration or start_time + 1 srt_data += '%d\n%s --> %s\n%s\n\n' % (line + 1, srt_subtitles_timecode(start_time), diff --git a/yt_dlp/extractor/loom.py b/yt_dlp/extractor/loom.py new file mode 100644 index 000000000000..1191aa17ea8d --- /dev/null +++ b/yt_dlp/extractor/loom.py @@ -0,0 +1,461 @@ +import json +import textwrap +import urllib.parse +import uuid + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + filter_dict, + get_first, + int_or_none, + parse_iso8601, + update_url, + url_or_none, + variadic, +) +from ..utils.traversal import traverse_obj + + +class LoomIE(InfoExtractor): + IE_NAME = 'loom' + _VALID_URL = 
r'https?://(?:www\.)?loom\.com/(?:share|embed)/(?P[\da-f]{32})' + _EMBED_REGEX = [rf']+\bsrc=["\'](?P{_VALID_URL})'] + _TESTS = [{ + # m3u8 raw-url, mp4 transcoded-url, cdn url == raw-url, json subs only + 'url': 'https://www.loom.com/share/43d05f362f734614a2e81b4694a3a523', + 'md5': 'bfc2d7e9c2e0eb4813212230794b6f42', + 'info_dict': { + 'id': '43d05f362f734614a2e81b4694a3a523', + 'ext': 'mp4', + 'title': 'A Ruler for Windows - 28 March 2022', + 'uploader': 'wILLIAM PIP', + 'upload_date': '20220328', + 'timestamp': 1648454238, + 'duration': 27, + }, + }, { + # webm raw-url, mp4 transcoded-url, cdn url == transcoded-url, no subs + 'url': 'https://www.loom.com/share/c43a642f815f4378b6f80a889bb73d8d', + 'md5': '70f529317be8cf880fcc2c649a531900', + 'info_dict': { + 'id': 'c43a642f815f4378b6f80a889bb73d8d', + 'ext': 'webm', + 'title': 'Lilah Nielsen Intro Video', + 'uploader': 'Lilah Nielsen', + 'upload_date': '20200826', + 'timestamp': 1598480716, + 'duration': 20, + }, + }, { + # m3u8 raw-url, mp4 transcoded-url, cdn url == raw-url, vtt sub and json subs + 'url': 'https://www.loom.com/share/9458bcbf79784162aa62ffb8dd66201b', + 'md5': '51737ec002969dd28344db4d60b9cbbb', + 'info_dict': { + 'id': '9458bcbf79784162aa62ffb8dd66201b', + 'ext': 'mp4', + 'title': 'Sharing screen with gpt-4', + 'description': 'Sharing screen with GPT 4 vision model and asking questions to guide through blender.', + 'uploader': 'Suneel Matham', + 'chapters': 'count:3', + 'upload_date': '20231109', + 'timestamp': 1699518978, + 'duration': 93, + }, + }, { + # mpd raw-url, mp4 transcoded-url, cdn url == raw-url, no subs + 'url': 'https://www.loom.com/share/24351eb8b317420289b158e4b7e96ff2', + 'info_dict': { + 'id': '24351eb8b317420289b158e4b7e96ff2', + 'ext': 'webm', + 'title': 'OMFG clown', + 'description': 'md5:285c5ee9d62aa087b7e3271b08796815', + 'uploader': 'MrPumkin B', + 'upload_date': '20210924', + 'timestamp': 1632519618, + 'duration': 210, + }, + 'params': {'skip_download': 'dash'}, + }, 
{ + # password-protected + 'url': 'https://www.loom.com/share/50e26e8aeb7940189dff5630f95ce1f4', + 'md5': '5cc7655e7d55d281d203f8ffd14771f7', + 'info_dict': { + 'id': '50e26e8aeb7940189dff5630f95ce1f4', + 'ext': 'mp4', + 'title': 'iOS Mobile Upload', + 'uploader': 'Simon Curran', + 'upload_date': '20200520', + 'timestamp': 1590000123, + 'duration': 35, + }, + 'params': {'videopassword': 'seniorinfants2'}, + }, { + # embed, transcoded-url endpoint sends empty JSON response + 'url': 'https://www.loom.com/embed/ddcf1c1ad21f451ea7468b1e33917e4e', + 'md5': '8488817242a0db1cb2ad0ea522553cf6', + 'info_dict': { + 'id': 'ddcf1c1ad21f451ea7468b1e33917e4e', + 'ext': 'mp4', + 'title': 'CF Reset User\'s Password', + 'uploader': 'Aimee Heintz', + 'upload_date': '20220707', + 'timestamp': 1657216459, + 'duration': 181, + }, + 'expected_warnings': ['Failed to parse JSON'], + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.loom.com/community/e1229802a8694a09909e8ba0fbb6d073-pg', + 'md5': 'ec838cd01b576cf0386f32e1ae424609', + 'info_dict': { + 'id': 'e1229802a8694a09909e8ba0fbb6d073', + 'ext': 'mp4', + 'title': 'Rexie Jane Cimafranca - Founder\'s Presentation', + 'uploader': 'Rexie Cimafranca', + 'upload_date': '20230213', + 'duration': 247, + 'timestamp': 1676274030, + }, + }] + + _GRAPHQL_VARIABLES = { + 'GetVideoSource': { + 'acceptableMimes': ['DASH', 'M3U8', 'MP4'], + }, + } + _GRAPHQL_QUERIES = { + 'GetVideoSSR': textwrap.dedent('''\ + query GetVideoSSR($videoId: ID!, $password: String) { + getVideo(id: $videoId, password: $password) { + __typename + ... on PrivateVideo { + id + status + message + __typename + } + ... on VideoPasswordMissingOrIncorrect { + id + message + __typename + } + ... 
on RegularUserVideo { + id + __typename + createdAt + description + download_enabled + folder_id + is_protected + needs_password + owner { + display_name + __typename + } + privacy + s3_id + name + video_properties { + avgBitRate + client + camera_enabled + client_version + duration + durationMs + format + height + microphone_enabled + os + os_version + recordingClient + recording_type + recording_version + screen_type + tab_audio + trim_duration + width + __typename + } + playable_duration + source_duration + visibility + } + } + }\n'''), + 'GetVideoSource': textwrap.dedent('''\ + query GetVideoSource($videoId: ID!, $password: String, $acceptableMimes: [CloudfrontVideoAcceptableMime]) { + getVideo(id: $videoId, password: $password) { + ... on RegularUserVideo { + id + nullableRawCdnUrl(acceptableMimes: $acceptableMimes, password: $password) { + url + __typename + } + __typename + } + __typename + } + }\n'''), + 'FetchVideoTranscript': textwrap.dedent('''\ + query FetchVideoTranscript($videoId: ID!, $password: String) { + fetchVideoTranscript(videoId: $videoId, password: $password) { + ... on VideoTranscriptDetails { + id + video_id + source_url + captions_source_url + __typename + } + ... on GenericError { + message + __typename + } + __typename + } + }\n'''), + 'FetchChapters': textwrap.dedent('''\ + query FetchChapters($videoId: ID!, $password: String) { + fetchVideoChapters(videoId: $videoId, password: $password) { + ... on VideoChapters { + video_id + content + __typename + } + ... on EmptyChaptersPayload { + content + __typename + } + ... on InvalidRequestWarning { + message + __typename + } + ... 
on Error { + message + __typename + } + __typename + } + }\n'''), + } + _APOLLO_GRAPHQL_VERSION = '0a1856c' + + def _call_graphql_api(self, operations, video_id, note=None, errnote=None): + password = self.get_param('videopassword') + return self._download_json( + 'https://www.loom.com/graphql', video_id, note or 'Downloading GraphQL JSON', + errnote or 'Failed to download GraphQL JSON', headers={ + 'Accept': 'application/json', + 'Content-Type': 'application/json', + 'x-loom-request-source': f'loom_web_{self._APOLLO_GRAPHQL_VERSION}', + 'apollographql-client-name': 'web', + 'apollographql-client-version': self._APOLLO_GRAPHQL_VERSION, + }, data=json.dumps([{ + 'operationName': operation_name, + 'variables': { + 'videoId': video_id, + 'password': password, + **self._GRAPHQL_VARIABLES.get(operation_name, {}), + }, + 'query': self._GRAPHQL_QUERIES[operation_name], + } for operation_name in variadic(operations)], separators=(',', ':')).encode()) + + def _call_url_api(self, endpoint, video_id): + response = self._download_json( + f'https://www.loom.com/api/campaigns/sessions/{video_id}/{endpoint}', video_id, + f'Downloading {endpoint} JSON', f'Failed to download {endpoint} JSON', fatal=False, + headers={'Accept': 'application/json', 'Content-Type': 'application/json'}, + data=json.dumps({ + 'anonID': str(uuid.uuid4()), + 'deviceID': None, + 'force_original': False, # HTTP error 401 if True + 'password': self.get_param('videopassword'), + }, separators=(',', ':')).encode()) + return traverse_obj(response, ('url', {url_or_none})) + + def _extract_formats(self, video_id, metadata, gql_data): + formats = [] + video_properties = traverse_obj(metadata, ('video_properties', { + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'acodec': ('microphone_enabled', {lambda x: 'none' if x is False else None}), + })) + + def get_formats(format_url, format_id, quality): + if not format_url: + return + ext = determine_ext(format_url) + query = 
urllib.parse.urlparse(format_url).query + + if ext == 'm3u8': + # Extract pre-merged HLS formats to avoid buggy parsing of metadata in split playlists + format_url = format_url.replace('-split.m3u8', '.m3u8') + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=f'hls-{format_id}', fatal=False, quality=quality) + for fmt in m3u8_formats: + yield { + **fmt, + 'url': update_url(fmt['url'], query=query), + 'extra_param_to_segment_url': query, + } + + elif ext == 'mpd': + dash_formats = self._extract_mpd_formats( + format_url, video_id, mpd_id=f'dash-{format_id}', fatal=False) + for fmt in dash_formats: + yield { + **fmt, + 'extra_param_to_segment_url': query, + 'quality': quality, + } + + else: + yield { + 'url': format_url, + 'ext': ext, + 'format_id': f'http-{format_id}', + 'quality': quality, + **video_properties, + } + + raw_url = self._call_url_api('raw-url', video_id) + formats.extend(get_formats(raw_url, 'raw', quality=1)) # original quality + + transcoded_url = self._call_url_api('transcoded-url', video_id) + formats.extend(get_formats(transcoded_url, 'transcoded', quality=-1)) # transcoded quality + + cdn_url = get_first(gql_data, ('data', 'getVideo', 'nullableRawCdnUrl', 'url', {url_or_none})) + # cdn_url is usually a dupe, but the raw-url/transcoded-url endpoints could return errors + valid_urls = [update_url(url, query=None) for url in (raw_url, transcoded_url) if url] + if cdn_url and update_url(cdn_url, query=None) not in valid_urls: + formats.extend(get_formats(cdn_url, 'cdn', quality=0)) # could be original or transcoded + + return formats + + def _real_extract(self, url): + video_id = self._match_id(url) + metadata = get_first( + self._call_graphql_api('GetVideoSSR', video_id, 'Downloading GraphQL metadata JSON'), + ('data', 'getVideo', {dict})) or {} + + if metadata.get('__typename') == 'VideoPasswordMissingOrIncorrect': + if not self.get_param('videopassword'): + raise ExtractorError( + 'This video is 
password-protected, use the --video-password option', expected=True) + raise ExtractorError('Invalid video password', expected=True) + + gql_data = self._call_graphql_api(['FetchChapters', 'FetchVideoTranscript', 'GetVideoSource'], video_id) + duration = traverse_obj(metadata, ('video_properties', 'duration', {int_or_none})) + + return { + 'id': video_id, + 'duration': duration, + 'chapters': self._extract_chapters_from_description( + get_first(gql_data, ('data', 'fetchVideoChapters', 'content', {str})), duration) or None, + 'formats': self._extract_formats(video_id, metadata, gql_data), + 'subtitles': filter_dict({ + 'en': traverse_obj(gql_data, ( + ..., 'data', 'fetchVideoTranscript', + ('source_url', 'captions_source_url'), { + 'url': {url_or_none}, + })) or None, + }), + **traverse_obj(metadata, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'uploader': ('owner', 'display_name', {str}), + 'timestamp': ('createdAt', {parse_iso8601}), + }), + } + + +class LoomFolderIE(InfoExtractor): + IE_NAME = 'loom:folder' + _VALID_URL = r'https?://(?:www\.)?loom\.com/share/folder/(?P[\da-f]{32})' + _TESTS = [{ + # 2 subfolders, no videos in root + 'url': 'https://www.loom.com/share/folder/997db4db046f43e5912f10dc5f817b5c', + 'playlist_mincount': 16, + 'info_dict': { + 'id': '997db4db046f43e5912f10dc5f817b5c', + 'title': 'Blending Lessons', + }, + }, { + # only videos, no subfolders + 'url': 'https://www.loom.com/share/folder/9a8a87f6b6f546d9a400c8e7575ff7f2', + 'playlist_mincount': 12, + 'info_dict': { + 'id': '9a8a87f6b6f546d9a400c8e7575ff7f2', + 'title': 'List A- a, i, o', + }, + }, { + # videos in root and empty subfolder + 'url': 'https://www.loom.com/share/folder/886e534218c24fd292e97e9563078cc4', + 'playlist_mincount': 21, + 'info_dict': { + 'id': '886e534218c24fd292e97e9563078cc4', + 'title': 'Medicare Agent Training videos', + }, + }, { + # videos in root and videos in subfolders + 'url': 
'https://www.loom.com/share/folder/b72c4ecdf04745da9403926d80a40c38', + 'playlist_mincount': 21, + 'info_dict': { + 'id': 'b72c4ecdf04745da9403926d80a40c38', + 'title': 'Quick Altos Q & A Tutorials', + }, + }, { + # recursive folder extraction + 'url': 'https://www.loom.com/share/folder/8b458a94e0e4449b8df9ea7a68fafc4e', + 'playlist_count': 23, + 'info_dict': { + 'id': '8b458a94e0e4449b8df9ea7a68fafc4e', + 'title': 'Sezer Texting Guide', + }, + }, { + # more than 50 videos in 1 folder + 'url': 'https://www.loom.com/share/folder/e056a91d290d47ca9b00c9d1df56c463', + 'playlist_mincount': 61, + 'info_dict': { + 'id': 'e056a91d290d47ca9b00c9d1df56c463', + 'title': 'User Videos', + }, + }, { + # many subfolders + 'url': 'https://www.loom.com/share/folder/c2dde8cc67454f0e99031677279d8954', + 'playlist_mincount': 75, + 'info_dict': { + 'id': 'c2dde8cc67454f0e99031677279d8954', + 'title': 'Honors 1', + }, + }, { + 'url': 'https://www.loom.com/share/folder/bae17109a68146c7803454f2893c8cf8/Edpuzzle', + 'only_matching': True, + }] + + def _extract_folder_data(self, folder_id): + return self._download_json( + f'https://www.loom.com/v1/folders/{folder_id}', folder_id, + 'Downloading folder info JSON', query={'limit': '10000'}) + + def _extract_folder_entries(self, folder_id, initial_folder_data=None): + folder_data = initial_folder_data or self._extract_folder_data(folder_id) + + for video in traverse_obj(folder_data, ('videos', lambda _, v: v['id'])): + video_id = video['id'] + yield self.url_result( + f'https://www.loom.com/share/{video_id}', LoomIE, video_id, video.get('name')) + + # Recurse into subfolders + for subfolder_id in traverse_obj(folder_data, ( + 'folders', lambda _, v: v['id'] != folder_id, 'id', {str})): + yield from self._extract_folder_entries(subfolder_id) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + playlist_data = self._extract_folder_data(playlist_id) + + return self.playlist_result( + self._extract_folder_entries(playlist_id, 
playlist_data), playlist_id, + traverse_obj(playlist_data, ('folder', 'name', {str.strip}))) diff --git a/yt_dlp/extractor/masters.py b/yt_dlp/extractor/masters.py index 716f1c9615a2..c3c58d7d01fb 100644 --- a/yt_dlp/extractor/masters.py +++ b/yt_dlp/extractor/masters.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( traverse_obj, diff --git a/yt_dlp/extractor/microsoftstream.py b/yt_dlp/extractor/microsoftstream.py index 9b50996b703e..5f5f160876c4 100644 --- a/yt_dlp/extractor/microsoftstream.py +++ b/yt_dlp/extractor/microsoftstream.py @@ -1,4 +1,4 @@ -from base64 import b64decode +import base64 from .common import InfoExtractor from ..utils import ( @@ -81,7 +81,7 @@ def _real_extract(self, url): 'url': thumbnail_url, } thumb_name = url_basename(thumbnail_url) - thumb_name = str(b64decode(thumb_name + '=' * (-len(thumb_name) % 4))) + thumb_name = str(base64.b64decode(thumb_name + '=' * (-len(thumb_name) % 4))) thumb.update(parse_resolution(thumb_name)) thumbnails.append(thumb) diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py index 4be6947289c4..b980fd01a82d 100644 --- a/yt_dlp/extractor/mixch.py +++ b/yt_dlp/extractor/mixch.py @@ -1,5 +1,7 @@ from .common import InfoExtractor -from ..utils import UserNotLive, traverse_obj +from ..networking.exceptions import HTTPError +from ..utils import ExtractorError, UserNotLive, int_or_none, url_or_none +from ..utils.traversal import traverse_obj class MixchIE(InfoExtractor): @@ -25,25 +27,23 @@ class MixchIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(f'https://mixch.tv/u/{video_id}/live', video_id) - - initial_js_state = self._parse_json(self._search_regex( - r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id) - if not initial_js_state.get('liveInfo'): + data = 
self._download_json(f'https://mixch.tv/api-web/users/{video_id}/live', video_id) + if not traverse_obj(data, ('liveInfo', {dict})): raise UserNotLive(video_id=video_id) return { 'id': video_id, - 'title': traverse_obj(initial_js_state, ('liveInfo', 'title')), - 'comment_count': traverse_obj(initial_js_state, ('liveInfo', 'comments')), - 'view_count': traverse_obj(initial_js_state, ('liveInfo', 'visitor')), - 'timestamp': traverse_obj(initial_js_state, ('liveInfo', 'created')), - 'uploader': traverse_obj(initial_js_state, ('broadcasterInfo', 'name')), 'uploader_id': video_id, + **traverse_obj(data, { + 'title': ('liveInfo', 'title', {str}), + 'comment_count': ('liveInfo', 'comments', {int_or_none}), + 'view_count': ('liveInfo', 'visitor', {int_or_none}), + 'timestamp': ('liveInfo', 'created', {int_or_none}), + 'uploader': ('broadcasterInfo', 'name', {str}), + }), 'formats': [{ 'format_id': 'hls', - 'url': (traverse_obj(initial_js_state, ('liveInfo', 'hls')) - or f'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_{video_id}.m3u8'), + 'url': data['liveInfo']['hls'], 'ext': 'mp4', 'protocol': 'm3u8', }], @@ -60,22 +60,38 @@ class MixchArchiveIE(InfoExtractor): 'skip': 'paid video, no DRM. expires at Jan 23', 'info_dict': { 'id': '421', + 'ext': 'mp4', 'title': '96NEKO SHOW TIME', } + }, { + 'url': 'https://mixch.tv/archive/1213', + 'skip': 'paid video, no DRM. 
expires at Dec 31, 2023', + 'info_dict': { + 'id': '1213', + 'ext': 'mp4', + 'title': '怐ē‰¹åˆ„ćƒˆćƒ¼ć‚Æē•Ŗēµ„ć‚¢ćƒ¼ć‚«ć‚¤ćƒ–ć‚¹ć€‘Merm4idƗē‡čˆžę›² 2nd LIVE怌VERSUS怍', + 'release_date': '20231201', + 'thumbnail': str, + } + }, { + 'url': 'https://mixch.tv/archive/1214', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - html5_videos = self._parse_html5_media_entries( - url, webpage.replace('video-js', 'video'), video_id, 'hls') - if not html5_videos: - self.raise_login_required(method='cookies') - infodict = html5_videos[0] - infodict.update({ - 'id': video_id, - 'title': self._html_search_regex(r'class="archive-title">(.+?)[^/?#]{{46}})/?(?:$|[?#])', + rf'{_BASE_URL_RE}(?!:v:)(?:[^/?#]+/)*stream\.aspx\?(?:[^#]+&)?id=(?P[^&#]+)', + ] + _TESTS = [{ + 'url': 'https://lut-my.sharepoint.com/:v:/g/personal/juha_eerola_student_lab_fi/EUrAmrktb4ZMhUcY9J2PqMEBD_9x_l0DyYWVgAvp-TTOMw?e=ZpQOOw', + 'md5': '2950821d0d4937a0a76373782093b435', + 'info_dict': { + 'id': '01EQRS7EKKYCNLSLLPQZGIKRYY6SOY7KGB', + 'display_id': 'EUrAmrktb4ZMhUcY9J2PqMEBD_9x_l0DyYWVgAvp-TTOMw', + 'ext': 'mp4', + 'title': 'CmvpJST', + 'duration': 54.567, + 'thumbnail': r're:https://.+/thumbnail', + 'uploader_id': '8dcec565-a956-4b91-95e5-bacfb8bc015f', + }, + }, { + 'url': 'https://greaternyace.sharepoint.com/:v:/s/acementornydrive/ETski5eAfNVEoPRZUAyy1wEBpLgVFYWso5bjbZjfBLlPUg?e=PQUfVb', + 'md5': 'c496a01644223273bff12e93e501afd1', + 'info_dict': { + 'id': '01QI4AVTZ3ESFZPAD42VCKB5CZKAGLFVYB', + 'display_id': 'ETski5eAfNVEoPRZUAyy1wEBpLgVFYWso5bjbZjfBLlPUg', + 'ext': 'mp4', + 'title': '930103681233985536', + 'duration': 3797.326, + 'thumbnail': r're:https://.+/thumbnail', + }, + }, { + 'url': 
'https://lut-my.sharepoint.com/personal/juha_eerola_student_lab_fi/_layouts/15/stream.aspx?id=%2Fpersonal%2Fjuha_eerola_student_lab_fi%2FDocuments%2FM-DL%2FCmvpJST.mp4&ga=1&referrer=StreamWebApp.Web&referrerScenario=AddressBarCopied.view', + 'info_dict': { + 'id': '01EQRS7EKKYCNLSLLPQZGIKRYY6SOY7KGB', + 'display_id': '/personal/juha_eerola_student_lab_fi/Documents/M-DL/CmvpJST.mp4', + 'ext': 'mp4', + 'title': 'CmvpJST', + 'duration': 54.567, + 'thumbnail': r're:https://.+/thumbnail', + 'uploader_id': '8dcec565-a956-4b91-95e5-bacfb8bc015f', + }, + 'skip': 'Session cookies needed', + }, { + 'url': 'https://izoobasisschool.sharepoint.com/:v:/g/Eaqleq8COVBIvIPvod0U27oBypC6aWOkk8ptuDpmJ6arHw', + 'only_matching': True, + }, { + 'url': 'https://uskudaredutr-my.sharepoint.com/:v:/g/personal/songul_turkaydin_uskudar_edu_tr/EbTf-VRUIbtGuIN73tx1MuwBCHBOmNcWNqSLw61Fd2_o0g?e=n5Vkof', + 'only_matching': True, + }, { + 'url': 'https://epam-my.sharepoint.com/:v:/p/dzmitry_tamashevich/Ec4ZOs-rATZHjFYZWVxjczEB649FCoYFKDV_x3RxZiWAGA?e=4hswgA', + 'only_matching': True, + }, { + 'url': 'https://microsoft.sharepoint.com/:v:/t/MicrosoftSPARKRecordings-MSFTInternal/EWCyeqByVWBAt8wDvNZdV-UB0BvU5YVbKm0UHgdrUlI6dg?e=QbPck6', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = urllib.parse.unquote(self._match_id(url)) + webpage, urlh = self._download_webpage_handle(url, display_id) + if urllib.parse.urlparse(urlh.url).hostname == 'login.microsoftonline.com': + self.raise_login_required( + 'Session cookies are required for this URL and can be passed ' + 'with the --cookies option. 
The --cookies-from-browser option will not work', method=None) + + video_data = self._search_json(r'g_fileInfo\s*=', webpage, 'player config', display_id) + video_id = video_data['VroomItemId'] + + parsed_url = urllib.parse.urlparse(video_data['.transformUrl']) + base_media_url = urllib.parse.urlunparse(parsed_url._replace( + path=urllib.parse.urljoin(f'{parsed_url.path}/', '../videomanifest'), + query=urllib.parse.urlencode({ + **urllib.parse.parse_qs(parsed_url.query), + 'cTag': video_data['.ctag'], + 'action': 'Access', + 'part': 'index', + }, doseq=True))) + + # Web player adds more params to the format URLs but we still get all formats without them + formats = self._extract_mpd_formats( + base_media_url, video_id, mpd_id='dash', query={'format': 'dash'}, fatal=False) + for hls_type in ('hls', 'hls-vnext'): + formats.extend(self._extract_m3u8_formats( + base_media_url, video_id, 'mp4', m3u8_id=hls_type, + query={'format': hls_type}, fatal=False, quality=-2)) + + if video_url := traverse_obj(video_data, ('downloadUrl', {url_or_none})): + formats.append({ + 'url': video_url, + 'ext': determine_ext(video_data.get('extension') or video_data.get('name')), + 'quality': 1, + 'format_id': 'source', + 'filesize': int_or_none(video_data.get('size')), + 'vcodec': 'none' if video_data.get('isAudio') is True else None, + }) + + return { + 'id': video_id, + 'formats': formats, + 'title': video_data.get('title') or video_data.get('displayName'), + 'display_id': display_id, + 'uploader_id': video_data.get('authorId'), + 'duration': traverse_obj(video_data, ( + 'MediaServiceFastMetadata', {json.loads}, 'media', 'duration', {lambda x: x / 10000000})), + 'thumbnail': url_or_none(video_data.get('thumbnailUrl')), + } diff --git a/yt_dlp/extractor/sonyliv.py b/yt_dlp/extractor/sonyliv.py index a6da44525088..7c914acbed23 100644 --- a/yt_dlp/extractor/sonyliv.py +++ b/yt_dlp/extractor/sonyliv.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt import itertools import json 
import math @@ -94,7 +94,7 @@ def _perform_login(self, username, password): 'mobileNumber': username, 'channelPartnerID': 'MSMIND', 'country': 'IN', - 'timestamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'), + 'timestamp': dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'), 'otpSize': 6, 'loginType': 'REGISTERORSIGNIN', 'isMobileMandatory': True, @@ -111,7 +111,7 @@ def _perform_login(self, username, password): 'otp': self._get_tfa_info('OTP'), 'dmaId': 'IN', 'ageConfirmation': True, - 'timestamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'), + 'timestamp': dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'), 'isMobileMandatory': True, }).encode()) if otp_verify_json['resultCode'] == 'KO': diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index a7c2afd49792..c9ed645eb701 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -1,30 +1,27 @@ import itertools -import re import json -# import random +import re -from .common import ( - InfoExtractor, - SearchInfoExtractor -) +from .common import InfoExtractor, SearchInfoExtractor from ..compat import compat_str -from ..networking import HEADRequest, Request +from ..networking import HEADRequest from ..networking.exceptions import HTTPError from ..utils import ( - error_to_compat_str, + KNOWN_EXTENSIONS, ExtractorError, + error_to_compat_str, float_or_none, int_or_none, - KNOWN_EXTENSIONS, mimetype2ext, parse_qs, str_or_none, - try_get, + try_call, unified_timestamp, update_url_query, url_or_none, urlhandle_detect_ext, ) +from ..utils.traversal import traverse_obj class SoundcloudEmbedIE(InfoExtractor): @@ -54,7 +51,6 @@ class SoundcloudBaseIE(InfoExtractor): _API_AUTH_QUERY_TEMPLATE = '?client_id=%s' _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s' _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s' - _access_token = None _HEADERS = {} _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' @@ 
-112,21 +108,31 @@ def _download_json(self, *args, **kwargs): def _initialize_pre_login(self): self._CLIENT_ID = self.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf' + def _verify_oauth_token(self, token): + if self._request_webpage( + self._API_VERIFY_AUTH_TOKEN % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID), + None, note='Verifying login token...', fatal=False, + data=json.dumps({'session': {'access_token': token}}).encode()): + self._HEADERS['Authorization'] = f'OAuth {token}' + self.report_login() + else: + self.report_warning('Provided authorization token is invalid. Continuing as guest') + + def _real_initialize(self): + if self._HEADERS: + return + if token := try_call(lambda: self._get_cookies(self._BASE_URL)['oauth_token'].value): + self._verify_oauth_token(token) + def _perform_login(self, username, password): if username != 'oauth': - self.report_warning( + raise ExtractorError( 'Login using username and password is not currently supported. ' - 'Use "--username oauth --password " to login using an oauth token') - self._access_token = password - query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID - payload = {'session': {'access_token': self._access_token}} - token_verification = Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8')) - response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False) - if response is not False: - self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} - self.report_login() - else: - self.report_warning('Provided authorization token seems to be invalid. 
Continue as guest') + 'Use "--username oauth --password " to login using an oauth token, ' + f'or else {self._login_hint(method="cookies")}', expected=True) + if self._HEADERS: + return + self._verify_oauth_token(password) r''' def genDevId(): @@ -147,14 +153,17 @@ def genNumBlock(): 'user_agent': self._USER_AGENT } - query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID - login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8')) - response = self._download_json(login, None) - self._access_token = response.get('session').get('access_token') - if not self._access_token: - self.report_warning('Unable to get access token, login may has failed') - else: - self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} + response = self._download_json( + self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID), + None, note='Verifying login token...', fatal=False, + data=json.dumps(payload).encode()) + + if token := traverse_obj(response, ('session', 'access_token', {str})): + self._HEADERS['Authorization'] = f'OAuth {token}' + self.report_login() + return + + raise ExtractorError('Unable to get access token, login may have failed', expected=True) ''' # signature generation @@ -217,6 +226,7 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_f 'filesize': int_or_none(urlh.headers.get('Content-Length')), 'url': format_url, 'quality': 10, + 'format_note': 'Original', }) def invalid_url(url): @@ -233,9 +243,13 @@ def add_format(f, protocol, is_preview=False): format_id_list.append(protocol) ext = f.get('ext') if ext == 'aac': - f['abr'] = '256' + f.update({ + 'abr': 256, + 'quality': 5, + 'format_note': 'Premium', + }) for k in ('ext', 'abr'): - v = f.get(k) + v = str_or_none(f.get(k)) if v: format_id_list.append(v) preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) @@ -256,16 +270,25 @@ def add_format(f, protocol, is_preview=False): formats.append(f) # New API - 
transcodings = try_get( - info, lambda x: x['media']['transcodings'], list) or [] - for t in transcodings: - if not isinstance(t, dict): - continue - format_url = url_or_none(t.get('url')) - if not format_url: - continue - stream = None if extract_flat else self._download_json( - format_url, track_id, query=query, fatal=False, headers=self._HEADERS) + for t in traverse_obj(info, ('media', 'transcodings', lambda _, v: url_or_none(v['url']))): + if extract_flat: + break + format_url = t['url'] + stream = None + + for retry in self.RetryManager(fatal=False): + try: + stream = self._download_json(format_url, track_id, query=query, headers=self._HEADERS) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 429: + self.report_warning( + 'You have reached the API rate limit, which is ~600 requests per ' + '10 minutes. Use the --extractor-retries and --retry-sleep options ' + 'to configure an appropriate retry count and wait time', only_once=True) + retry.error = e.cause + else: + self.report_warning(e.msg) + if not isinstance(stream, dict): continue stream_url = url_or_none(stream.get('url')) diff --git a/yt_dlp/extractor/telewebion.py b/yt_dlp/extractor/telewebion.py index 9378ed0214ab..5fdcddd8b328 100644 --- a/yt_dlp/extractor/telewebion.py +++ b/yt_dlp/extractor/telewebion.py @@ -1,8 +1,7 @@ from __future__ import annotations - +import functools import json -from functools import partial -from textwrap import dedent +import textwrap from .common import InfoExtractor from ..utils import ExtractorError, format_field, int_or_none, parse_iso8601 @@ -10,7 +9,7 @@ def _fmt_url(url): - return partial(format_field, template=url, default=None) + return functools.partial(format_field, template=url, default=None) class TelewebionIE(InfoExtractor): @@ -88,7 +87,7 @@ def _real_extract(self, url): if not video_id.startswith('0x'): video_id = hex(int(video_id)) - episode_data = self._call_graphql_api('getEpisodeDetail', video_id, dedent(''' + 
episode_data = self._call_graphql_api('getEpisodeDetail', video_id, textwrap.dedent(''' queryEpisode(filter: {EpisodeID: $EpisodeId}, first: 1) { title program { @@ -127,7 +126,7 @@ def _real_extract(self, url): 'formats': ( 'channel', 'descriptor', {str}, {_fmt_url(f'https://cdna.telewebion.com/%s/episode/{video_id}/playlist.m3u8')}, - {partial(self._extract_m3u8_formats, video_id=video_id, ext='mp4', m3u8_id='hls')}), + {functools.partial(self._extract_m3u8_formats, video_id=video_id, ext='mp4', m3u8_id='hls')}), })) info_dict['id'] = video_id return info_dict diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index a98275d8628c..11cc5705e953 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -1,7 +1,7 @@ import base64 +import datetime as dt import functools import itertools -from datetime import datetime from .common import InfoExtractor from ..networking import HEADRequest @@ -70,7 +70,7 @@ def _get_bearer_token(self, video_id): username, password = self._get_login_info() if username is None or password is None: self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.') - _timestamp = datetime.now().strftime('%Y%m%d000000') + _timestamp = dt.datetime.now().strftime('%Y%m%d000000') _auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii') data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={ 'X-Network-Ten-Auth': _auth_header, diff --git a/yt_dlp/extractor/thisoldhouse.py b/yt_dlp/extractor/thisoldhouse.py index 15f8380d3678..fbc12d55d901 100644 --- a/yt_dlp/extractor/thisoldhouse.py +++ b/yt_dlp/extractor/thisoldhouse.py @@ -1,5 +1,6 @@ import json +from .brightcove import BrightcoveNewIE from .common import InfoExtractor from .zype import ZypeIE from ..networking import HEADRequest @@ -8,6 +9,7 @@ ExtractorError, filter_dict, parse_qs, + smuggle_url, try_call, urlencode_postdata, ) @@ -17,23 
+19,43 @@ class ThisOldHouseIE(InfoExtractor): _NETRC_MACHINE = 'thisoldhouse' _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/?#]+/)?\d+)/(?P[^/?#]+)' _TESTS = [{ + # Unresolved Brightcove URL embed (formerly Zype), free 'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench', 'info_dict': { - 'id': '5dcdddf673c3f956ef5db202', + 'id': '6325298523112', 'ext': 'mp4', 'title': 'How to Build a Storage Bench', 'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.', - 'timestamp': 1442548800, - 'upload_date': '20150918', - 'duration': 674, - 'view_count': int, - 'average_rating': 0, - 'thumbnail': r're:^https?://.*\.jpg\?\d+$', - 'display_id': 'how-to-build-a-storage-bench', + 'timestamp': 1681793639, + 'upload_date': '20230418', + 'duration': 674.54, + 'tags': 'count:11', + 'uploader_id': '6314471934001', + 'thumbnail': r're:^https?://.*\.jpg', }, 'params': { 'skip_download': True, }, + }, { + # Brightcove embed, authwalled + 'url': 'https://www.thisoldhouse.com/glen-ridge-generational/99537/s45-e17-multi-generational', + 'info_dict': { + 'id': '6349675446112', + 'ext': 'mp4', + 'title': 'E17 | Glen Ridge Generational | Multi-Generational', + 'description': 'md5:53c6bc2e8031f3033d693d9a3563222c', + 'timestamp': 1711382202, + 'upload_date': '20240325', + 'duration': 1422.229, + 'tags': 'count:13', + 'uploader_id': '6314471934001', + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'expected_warnings': ['Login with password is not supported for this website'], + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires subscription', }, { # Page no longer has video 'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins', @@ -98,7 +120,15 @@ def _real_extract(self, url): video_url, video_id = self._search_regex( 
r']+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]', - webpage, 'video url', group=(1, 2)) - video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url + webpage, 'zype url', group=(1, 2), default=(None, None)) + if video_url: + video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url + return self.url_result(video_url, ZypeIE, video_id) - return self.url_result(video_url, ZypeIE, video_id) + video_url, video_id = self._search_regex([ + r']+src=[\'"]((?:https?:)?//players\.brightcove\.net/\d+/\w+/index\.html\?videoId=(\d+))', + r']+src=[\'"]((?:https?:)?//(?:www\.)thisoldhouse\.com/videos/brightcove/(\d+))'], + webpage, 'iframe url', group=(1, 2)) + if not parse_qs(video_url).get('videoId'): + video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Brightcove URL').url + return self.url_result(smuggle_url(video_url, {'referrer': url}), BrightcoveNewIE, video_id) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 02545bc79ccc..295e14932a87 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -4,6 +4,7 @@ import re import string import time +import uuid from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse @@ -30,19 +31,65 @@ class TikTokBaseIE(InfoExtractor): - _APP_VERSIONS = [('26.1.3', '260103'), ('26.1.2', '260102'), ('26.1.1', '260101'), ('25.6.2', '250602')] - _WORKING_APP_VERSION = None - _APP_NAME = 'trill' - _AID = 1180 _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s' _WEBPAGE_HOST = 'https://www.tiktok.com/' QUALITIES = ('360p', '540p', '720p', '1080p') + _APP_INFO_DEFAULTS = { + # unique "install id" + 'iid': None, + # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme + 'app_name': 'musical_ly', + 'app_version': '34.1.2', + 'manifest_app_version': '2023401020', + # "app id": aweme = 
1128, trill = 1180, musical_ly = 1233, universal = 0 + 'aid': '0', + } + _KNOWN_APP_INFO = [ + '7351144126450059040', + '7351149742343391009', + '7351153174894626592', + ] + _APP_INFO_POOL = None + _APP_INFO = None + _APP_USER_AGENT = None + @property def _API_HOSTNAME(self): return self._configuration_arg( 'api_hostname', ['api22-normal-c-useast2a.tiktokv.com'], ie_key=TikTokIE)[0] + def _get_next_app_info(self): + if self._APP_INFO_POOL is None: + defaults = { + key: self._configuration_arg(key, [default], ie_key=TikTokIE)[0] + for key, default in self._APP_INFO_DEFAULTS.items() + if key != 'iid' + } + app_info_list = ( + self._configuration_arg('app_info', ie_key=TikTokIE) + or random.sample(self._KNOWN_APP_INFO, len(self._KNOWN_APP_INFO))) + self._APP_INFO_POOL = [ + {**defaults, **dict( + (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v + )} for app_info in app_info_list + ] + + if not self._APP_INFO_POOL: + return False + + self._APP_INFO = self._APP_INFO_POOL.pop(0) + + app_name = self._APP_INFO['app_name'] + version = self._APP_INFO['manifest_app_version'] + if app_name == 'musical_ly': + package = f'com.zhiliaoapp.musically/{version}' + else: # trill, aweme + package = f'com.ss.android.ugc.{app_name}/{version}' + self._APP_USER_AGENT = f'{package} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)' + + return True + @staticmethod def _create_url(user_id, video_id): return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}' @@ -58,7 +105,7 @@ def _get_universal_data(self, webpage, display_id): 'universal data', display_id, end_pattern=r'', default={}), ('__DEFAULT_SCOPE__', {dict})) or {} - def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, + def _call_api_impl(self, ep, query, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160))) 
webpage_cookies = self._get_cookies(self._WEBPAGE_HOST) @@ -67,80 +114,84 @@ def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, return self._download_json( 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ - 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)', + 'User-Agent': self._APP_USER_AGENT, 'Accept': 'application/json', }, query=query) - def _build_api_query(self, query, app_version, manifest_app_version): + def _build_api_query(self, query): return { **query, - 'version_name': app_version, - 'version_code': manifest_app_version, - 'build_number': app_version, - 'manifest_version_code': manifest_app_version, - 'update_version_code': manifest_app_version, - 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), - 'uuid': ''.join(random.choices(string.digits, k=16)), - '_rticket': int(time.time() * 1000), - 'ts': int(time.time()), - 'device_brand': 'Google', - 'device_type': 'Pixel 7', 'device_platform': 'android', + 'os': 'android', + 'ssmix': 'a', + '_rticket': int(time.time() * 1000), + 'cdid': str(uuid.uuid4()), + 'channel': 'googleplay', + 'aid': self._APP_INFO['aid'], + 'app_name': self._APP_INFO['app_name'], + 'version_code': ''.join((f'{int(v):02d}' for v in self._APP_INFO['app_version'].split('.'))), + 'version_name': self._APP_INFO['app_version'], + 'manifest_version_code': self._APP_INFO['manifest_app_version'], + 'update_version_code': self._APP_INFO['manifest_app_version'], + 'ab_version': self._APP_INFO['app_version'], 'resolution': '1080*2400', 'dpi': 420, - 'os_version': '13', + 'device_type': 'Pixel 7', + 'device_brand': 'Google', + 'language': 'en', 'os_api': '29', - 'carrier_region': 'US', + 'os_version': '13', + 'ac': 'wifi', + 'is_pad': '0', + 'current_region': 'US', + 'app_type': 'normal', 'sys_region': 'US', - 'region': 
'US', - 'app_name': self._APP_NAME, - 'app_language': 'en', - 'language': 'en', + 'last_install_time': int(time.time()) - random.randint(86400, 1123200), 'timezone_name': 'America/New_York', + 'residence': 'US', + 'app_language': 'en', 'timezone_offset': '-14400', - 'channel': 'googleplay', - 'ac': 'wifi', - 'mcc_mnc': '310260', - 'is_my_cn': 0, - 'aid': self._AID, - 'ssmix': 'a', - 'as': 'a1qwert123', - 'cp': 'cbfhckdckkde1', + 'host_abi': 'armeabi-v7a', + 'locale': 'en', + 'ac2': 'wifi5g', + 'uoo': '1', + 'op_region': 'US', + 'build_number': self._APP_INFO['app_version'], + 'region': 'US', + 'ts': int(time.time()), + 'iid': self._APP_INFO['iid'], + 'device_id': random.randint(7250000000000000000, 7351147085025500000), + 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), } def _call_api(self, ep, query, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): - if not self._WORKING_APP_VERSION: - app_version = self._configuration_arg('app_version', [''], ie_key=TikTokIE.ie_key())[0] - manifest_app_version = self._configuration_arg('manifest_app_version', [''], ie_key=TikTokIE.ie_key())[0] - if app_version and manifest_app_version: - self._WORKING_APP_VERSION = (app_version, manifest_app_version) - self.write_debug('Imported app version combo from extractor arguments') - elif app_version or manifest_app_version: - self.report_warning('Only one of the two required version params are passed as extractor arguments', only_once=True) - - if self._WORKING_APP_VERSION: - app_version, manifest_app_version = self._WORKING_APP_VERSION - real_query = self._build_api_query(query, app_version, manifest_app_version) - return self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote) - - for count, (app_version, manifest_app_version) in enumerate(self._APP_VERSIONS, start=1): - real_query = self._build_api_query(query, app_version, manifest_app_version) + if not self._APP_INFO and not 
self._get_next_app_info(): + message = 'No working app info is available' + if fatal: + raise ExtractorError(message, expected=True) + else: + self.report_warning(message) + return + + max_tries = len(self._APP_INFO_POOL) + 1 # _APP_INFO_POOL + _APP_INFO + for count in itertools.count(1): + self.write_debug(str(self._APP_INFO)) + real_query = self._build_api_query(query) try: - res = self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote) - self._WORKING_APP_VERSION = (app_version, manifest_app_version) - return res + return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote) except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: - if count == len(self._APP_VERSIONS): + message = str(e.cause or e.msg) + if not self._get_next_app_info(): if fatal: - raise e + raise else: - self.report_warning(str(e.cause or e.msg)) + self.report_warning(message) return - self.report_warning('%s. Retrying... (attempt %s of %s)' % (str(e.cause or e.msg), count, len(self._APP_VERSIONS))) + self.report_warning(f'{message}. Retrying... 
(attempt {count} of {max_tries})') continue - raise e + raise def _extract_aweme_app(self, aweme_id): feed_list = self._call_api( @@ -223,6 +274,7 @@ def audio_meta(url): def extract_addr(addr, add_meta={}): parsed_meta, res = parse_url_key(addr.get('url_key', '')) + is_bytevc2 = parsed_meta.get('vcodec') == 'bytevc2' if res: known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height'))) known_resolutions[res].setdefault('width', int_or_none(addr.get('width'))) @@ -235,8 +287,11 @@ def extract_addr(addr, add_meta={}): 'acodec': 'aac', 'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked **add_meta, **parsed_meta, + # bytevc2 is bytedance's proprietary (unplayable) video codec + 'preference': -100 if is_bytevc2 else -1, 'format_note': join_nonempty( - add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' '), + add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, + '(UNPLAYABLE)' if is_bytevc2 else None, delim=' '), **audio_meta(url), } for url in addr.get('url_list') or []] diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index c55786a0dce1..80cba09155db 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -191,17 +191,25 @@ def _get_thumbnails(self, thumbnail): }] if thumbnail else None def _extract_twitch_m3u8_formats(self, path, video_id, token, signature): - return self._extract_m3u8_formats( + formats = self._extract_m3u8_formats( f'{self._USHER_BASE}/{path}/{video_id}.m3u8', video_id, 'mp4', query={ 'allow_source': 'true', 'allow_audio_only': 'true', 'allow_spectre': 'true', 'p': random.randint(1000000, 10000000), + 'platform': 'web', 'player': 'twitchweb', + 'supported_codecs': 'av1,h265,h264', 'playlist_include_framerate': 'true', 'sig': signature, 'token': token, }) + for fmt in formats: + if fmt.get('vcodec') and fmt['vcodec'].startswith('av01'): + # mpegts does not yet have proper support for av1 + 
fmt['downloader_options'] = {'ffmpeg_args_out': ['-f', 'mp4']} + + return formats class TwitchVodIE(TwitchBaseIE): diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 497233d95f8c..3d26549a4036 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -16,6 +16,7 @@ join_nonempty, jwt_encode_hs256, make_archive_id, + merge_dicts, parse_age_limit, parse_iso8601, str_or_none, @@ -425,3 +426,64 @@ def _real_extract(self, url): ['description', 'twitter:description', 'og:description'], webpage), '_old_archive_ids': [make_archive_id('Canvas', video_id)], } + + +class Radio1BeIE(VRTBaseIE): + _VALID_URL = r'https?://radio1\.be/(?:lees|luister/select)/(?P[\w/-]+)' + _TESTS = [{ + 'url': 'https://radio1.be/luister/select/de-ochtend/komt-n-va-volgend-jaar-op-in-wallonie', + 'info_dict': { + 'id': 'eb6c22e9-544f-44f4-af39-cf8cccd29e22', + 'title': 'Komt N-VA volgend jaar op in WalloniĆ«?', + 'display_id': 'de-ochtend/komt-n-va-volgend-jaar-op-in-wallonie', + 'description': 'md5:b374ea1c9302f38362df9dea1931468e', + 'thumbnail': r're:https?://cds\.vrt\.radio/[^/#\?&]+' + }, + 'playlist_mincount': 1 + }, { + 'url': 'https://radio1.be/lees/europese-unie-wil-onmiddellijke-humanitaire-pauze-en-duurzaam-staakt-het-vuren-in-gaza?view=web', + 'info_dict': { + 'id': '5d47f102-dbdb-4fa0-832b-26c1870311f2', + 'title': 'Europese Unie wil "onmiddellijke humanitaire pauze" en "duurzaam staakt-het-vuren" in Gaza', + 'description': 'md5:1aad1fae7d39edeffde5d3e67d276b64', + 'thumbnail': r're:https?://cds\.vrt\.radio/[^/#\?&]+', + 'display_id': 'europese-unie-wil-onmiddellijke-humanitaire-pauze-en-duurzaam-staakt-het-vuren-in-gaza' + }, + 'playlist_mincount': 1 + }] + + def _extract_video_entries(self, next_js_data, display_id): + video_data = traverse_obj( + next_js_data, ((None, ('paragraphs', ...)), {lambda x: x if x['mediaReference'] else None})) + for data in video_data: + media_reference = data['mediaReference'] + formats, subtitles = 
self._extract_formats_and_subtitles( + self._call_api(media_reference), display_id) + + yield { + 'id': media_reference, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('body', {clean_html}) + }), + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + next_js_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['item'] + + return self.playlist_result( + self._extract_video_entries(next_js_data, display_id), **merge_dicts(traverse_obj( + next_js_data, ({ + 'id': ('id', {str}), + 'title': ('title', {str}), + 'description': (('description', 'content'), {clean_html}), + }), get_all=False), { + 'display_id': display_id, + 'title': self._html_search_meta(['name', 'og:title', 'twitter:title'], webpage), + 'description': self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage), + })) diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index bce5e8326be0..f2256fdc6035 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -1,6 +1,6 @@ +import base64 import re import urllib.parse -from base64 import b64decode from .common import InfoExtractor from ..networking import HEADRequest @@ -371,7 +371,7 @@ def _real_extract(self, url): webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id) data = self._parse_json( self._search_regex(r'wchanneljsonp-%s\'\]\s*=[^\"]*\"([A-Za-z0-9=/]*)' % channel_id, webpage, 'jsonp', channel_id), - channel_id, transform_source=lambda x: urllib.parse.unquote_plus(b64decode(x).decode('utf-8'))) + channel_id, transform_source=lambda x: urllib.parse.unquote_plus(base64.b64decode(x).decode('utf-8'))) # XXX: can there be more than one series? 
series = traverse_obj(data, ('series', 0), default={}) diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index 5df071503a99..59eef8490ff6 100644 --- a/yt_dlp/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py @@ -15,35 +15,35 @@ class XVideosIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?xvideos2?\.com/video| - (?:www\.)?xvideos\.es/video| + (?:[^/]+\.)?xvideos2?\.com/video\.?| + (?:www\.)?xvideos\.es/video\.?| (?:www|flashservice)\.xvideos\.com/embedframe/| static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= ) - (?P[0-9]+) + (?P[0-9a-z]+) ''' _TESTS = [{ - 'url': 'https://www.xvideos.com/video4588838/motorcycle_guy_cucks_influencer_steals_his_gf', - 'md5': '14cea69fcb84db54293b1e971466c2e1', + 'url': 'http://xvideos.com/video.ucuvbkfda4e/a_beautiful_red-haired_stranger_was_refused_but_still_came_to_my_room_for_sex', + 'md5': '396255a900a6bddb3e98985f0b86c3fd', 'info_dict': { - 'id': '4588838', + 'id': 'ucuvbkfda4e', 'ext': 'mp4', - 'title': 'Motorcycle Guy Cucks Influencer, Steals his GF', - 'duration': 108, + 'title': 'A Beautiful Red-Haired Stranger Was Refused, But Still Came To My Room For Sex', + 'duration': 1238, 'age_limit': 18, - 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg', + 'thumbnail': r're:^https://cdn\d+-pic.xvideos-cdn.com/.+\.jpg', } }, { # Broken HLS formats 'url': 'https://www.xvideos.com/video65982001/what_s_her_name', - 'md5': 'b82d7d7ef7d65a84b1fa6965f81f95a5', + 'md5': '56742808292c8fa1418e4538c262c58b', 'info_dict': { 'id': '65982001', 'ext': 'mp4', 'title': 'what\'s her name?', 'duration': 120, 'age_limit': 18, - 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg', + 'thumbnail': r're:^https://cdn\d+-pic.xvideos-cdn.com/.+\.jpg', } }, { 'url': 'https://flashservice.xvideos.com/embedframe/4588838', @@ -90,6 +90,18 @@ class XVideosIE(InfoExtractor): }, { 'url': 'https://de.xvideos.com/video4588838/biker_takes_his_girl', 'only_matching': True + }, { + 'url': 
'https://flashservice.xvideos.com/embedframe/ucuvbkfda4e', + 'only_matching': True, + }, { + 'url': 'https://www.xvideos.com/embedframe/ucuvbkfda4e', + 'only_matching': True, + }, { + 'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=ucuvbkfda4e', + 'only_matching': True, + }, { + 'url': 'https://xvideos.es/video.ucuvbkfda4e/a_beautiful_red-haired_stranger_was_refused_but_still_came_to_my_room_for_sex', + 'only_matching': True }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 33fd3b490020..e553fff9f171 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2,7 +2,7 @@ import calendar import collections import copy -import datetime +import datetime as dt import enum import hashlib import itertools @@ -33,6 +33,7 @@ clean_html, datetime_from_str, dict_get, + filesize_from_tbr, filter_dict, float_or_none, format_field, @@ -55,6 +56,7 @@ str_to_int, strftime_or_none, traverse_obj, + try_call, try_get, unescapeHTML, unified_strdate, @@ -922,10 +924,10 @@ def extract_relative_time(relative_time_text): def _parse_time_text(self, text): if not text: return - dt = self.extract_relative_time(text) + dt_ = self.extract_relative_time(text) timestamp = None - if isinstance(dt, datetime.datetime): - timestamp = calendar.timegm(dt.timetuple()) + if isinstance(dt_, dt.datetime): + timestamp = calendar.timegm(dt_.timetuple()) if timestamp is None: timestamp = ( @@ -3602,8 +3604,8 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, yt_query = { 'videoId': video_id, } - if _split_innertube_client(client)[0] == 'android': - yt_query['params'] = 'CgIQBg==' + if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'): + yt_query['params'] = 'CgIIAQ==' pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0] if pp_arg: @@ -3839,11 +3841,12 @@ def build_fragments(f): 10 if audio_track.get('audioIsDefault') and 10 else -10 if 
'descriptive' in (audio_track.get('displayName') or '').lower() and -10 else -1) + format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)})) # Some formats may have much smaller duration than others (possibly damaged during encoding) # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 # Make sure to avoid false positives with small duration differences. # E.g. __2ABJjxzNo, ySuUZEjARPY - is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500) + is_damaged = try_call(lambda: format_duration < duration // 2) if is_damaged: self.report_warning( f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) @@ -3873,6 +3876,7 @@ def build_fragments(f): 'quality': q(quality) - bool(fmt.get('isDrc')) / 2, 'has_drm': bool(fmt.get('drmFamilies')), 'tbr': tbr, + 'filesize_approx': filesize_from_tbr(tbr, format_duration), 'url': fmt_url, 'width': int_or_none(fmt.get('width')), 'language': join_nonempty(audio_track.get('id', '').split('.')[0], @@ -4564,7 +4568,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if upload_date and live_status not in ('is_live', 'post_live', 'is_upcoming'): # Newly uploaded videos' HLS formats are potentially problematic and need to be checked - upload_datetime = datetime_from_str(upload_date).replace(tzinfo=datetime.timezone.utc) + upload_datetime = datetime_from_str(upload_date).replace(tzinfo=dt.timezone.utc) if upload_datetime >= datetime_from_str('today-2days'): for fmt in info['formats']: if fmt.get('protocol') == 'm3u8_native': @@ -6965,7 +6969,7 @@ class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): IE_DESC = 'YouTube search' IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' - _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only + _SEARCH_PARAMS = 'EgIQAfABAQ==' # Videos only _TESTS = [{ 'url': 'ytsearch5:youtube-dl test video', 'playlist_count': 5, @@ -6973,6 +6977,14 @@ class 
YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): 'id': 'youtube-dl test video', 'title': 'youtube-dl test video', } + }, { + 'note': 'Suicide/self-harm search warning', + 'url': 'ytsearch1:i hate myself and i wanna die', + 'playlist_count': 1, + 'info_dict': { + 'id': 'i hate myself and i wanna die', + 'title': 'i hate myself and i wanna die', + } }] @@ -6980,7 +6992,7 @@ class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube search, newest videos first' - _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date + _SEARCH_PARAMS = 'CAISAhAB8AEB' # Videos only, sorted by date _TESTS = [{ 'url': 'ytsearchdate5:youtube-dl test video', 'playlist_count': 5, diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index 6bd9ea064e9b..5cc9c5f7a12f 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -1,5 +1,5 @@ import re -from uuid import uuid4 +import uuid from .common import InfoExtractor from ..compat import compat_str @@ -53,7 +53,7 @@ def _initialize_pre_login(self): self._request_webpage( '%s/zapi/v3/session/hello' % self._host_url(), None, 'Opening session', data=urlencode_postdata({ - 'uuid': compat_str(uuid4()), + 'uuid': compat_str(uuid.uuid4()), 'lang': 'en', 'app_version': '1.8.2', 'format': 'json', diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index acadc0147d59..356712c761ae 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -28,3 +28,10 @@ pass except Exception as e: warnings.warn(f'Failed to import "websockets" request handler: {e}' + bug_reports_message()) + +try: + from . 
import _curlcffi # noqa: F401 +except ImportError: + pass +except Exception as e: + warnings.warn(f'Failed to import "curl_cffi" request handler: {e}' + bug_reports_message()) diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py new file mode 100644 index 000000000000..39d1f70fb053 --- /dev/null +++ b/yt_dlp/networking/_curlcffi.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +import io +import math +import urllib.parse + +from ._helper import InstanceStoreMixin, select_proxy +from .common import ( + Features, + Request, + Response, + register_preference, + register_rh, +) +from .exceptions import ( + CertificateVerifyError, + HTTPError, + IncompleteRead, + ProxyError, + SSLError, + TransportError, +) +from .impersonate import ImpersonateRequestHandler, ImpersonateTarget +from ..dependencies import curl_cffi +from ..utils import int_or_none + +if curl_cffi is None: + raise ImportError('curl_cffi is not installed') + +curl_cffi_version = tuple(int_or_none(x, default=0) for x in curl_cffi.__version__.split('.')) + +if curl_cffi_version != (0, 5, 10): + curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)' + raise ImportError('Only curl_cffi 0.5.10 is supported') + +import curl_cffi.requests +from curl_cffi.const import CurlECode, CurlOpt + + +class CurlCFFIResponseReader(io.IOBase): + def __init__(self, response: curl_cffi.requests.Response): + self._response = response + self._iterator = response.iter_content() + self._buffer = b'' + self.bytes_read = 0 + + def readable(self): + return True + + def read(self, size=None): + exception_raised = True + try: + while self._iterator and (size is None or len(self._buffer) < size): + chunk = next(self._iterator, None) + if chunk is None: + self._iterator = None + break + self._buffer += chunk + self.bytes_read += len(chunk) + + if size is None: + size = len(self._buffer) + data = self._buffer[:size] + self._buffer = self._buffer[size:] + + # "free" the curl instance if the 
response is fully read. + # curl_cffi doesn't do this automatically and only allows one open response per thread + if not self._iterator and not self._buffer: + self.close() + exception_raised = False + return data + finally: + if exception_raised: + self.close() + + def close(self): + if not self.closed: + self._response.close() + self._buffer = b'' + super().close() + + +class CurlCFFIResponseAdapter(Response): + fp: CurlCFFIResponseReader + + def __init__(self, response: curl_cffi.requests.Response): + super().__init__( + fp=CurlCFFIResponseReader(response), + headers=response.headers, + url=response.url, + status=response.status_code) + + def read(self, amt=None): + try: + return self.fp.read(amt) + except curl_cffi.requests.errors.RequestsError as e: + if e.code == CurlECode.PARTIAL_FILE: + content_length = int_or_none(e.response.headers.get('Content-Length')) + raise IncompleteRead( + partial=self.fp.bytes_read, + expected=content_length - self.fp.bytes_read if content_length is not None else None, + cause=e) from e + raise TransportError(cause=e) from e + + +@register_rh +class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): + RH_NAME = 'curl_cffi' + _SUPPORTED_URL_SCHEMES = ('http', 'https') + _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY) + _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h') + _SUPPORTED_IMPERSONATE_TARGET_MAP = { + ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110, + ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107, + ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104, + ImpersonateTarget('chrome', '101', 'windows', '10'): curl_cffi.requests.BrowserType.chrome101, + ImpersonateTarget('chrome', '100', 'windows', '10'): curl_cffi.requests.BrowserType.chrome100, + ImpersonateTarget('chrome', '99', 'windows', '10'): 
curl_cffi.requests.BrowserType.chrome99, + ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101, + ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99, + ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5, + ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3, + ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android, + } + + def _create_instance(self, cookiejar=None): + return curl_cffi.requests.Session(cookies=cookiejar) + + def _check_extensions(self, extensions): + super()._check_extensions(extensions) + extensions.pop('impersonate', None) + extensions.pop('cookiejar', None) + extensions.pop('timeout', None) + + def _send(self, request: Request): + max_redirects_exceeded = False + session: curl_cffi.requests.Session = self._get_instance( + cookiejar=self._get_cookiejar(request) if 'cookie' not in request.headers else None) + + if self.verbose: + session.curl.setopt(CurlOpt.VERBOSE, 1) + + proxies = self._get_proxies(request) + if 'no' in proxies: + session.curl.setopt(CurlOpt.NOPROXY, proxies['no']) + proxies.pop('no', None) + + # curl doesn't support per protocol proxies, so we select the one that matches the request protocol + proxy = select_proxy(request.url, proxies=proxies) + if proxy: + session.curl.setopt(CurlOpt.PROXY, proxy) + scheme = urllib.parse.urlparse(request.url).scheme.lower() + if scheme != 'http': + # Enable HTTP CONNECT for HTTPS urls. + # Don't use CONNECT for http for compatibility with urllib behaviour. 
+ # See: https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html + session.curl.setopt(CurlOpt.HTTPPROXYTUNNEL, 1) + + headers = self._get_impersonate_headers(request) + + if self._client_cert: + session.curl.setopt(CurlOpt.SSLCERT, self._client_cert['client_certificate']) + client_certificate_key = self._client_cert.get('client_certificate_key') + client_certificate_password = self._client_cert.get('client_certificate_password') + if client_certificate_key: + session.curl.setopt(CurlOpt.SSLKEY, client_certificate_key) + if client_certificate_password: + session.curl.setopt(CurlOpt.KEYPASSWD, client_certificate_password) + + timeout = self._calculate_timeout(request) + + # set CURLOPT_LOW_SPEED_LIMIT and CURLOPT_LOW_SPEED_TIME to act as a read timeout. [1] + # curl_cffi does not currently do this. [2] + # Note: CURLOPT_LOW_SPEED_TIME is in seconds, so we need to round up to the nearest second. [3] + # [1] https://unix.stackexchange.com/a/305311 + # [2] https://github.com/yifeikong/curl_cffi/issues/156 + # [3] https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_TIME.html + session.curl.setopt(CurlOpt.LOW_SPEED_LIMIT, 1) # 1 byte per second + session.curl.setopt(CurlOpt.LOW_SPEED_TIME, math.ceil(timeout)) + + try: + curl_response = session.request( + method=request.method, + url=request.url, + headers=headers, + data=request.data, + verify=self.verify, + max_redirects=5, + timeout=timeout, + impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get( + self._get_request_target(request)), + interface=self.source_address, + stream=True + ) + except curl_cffi.requests.errors.RequestsError as e: + if e.code == CurlECode.PEER_FAILED_VERIFICATION: + raise CertificateVerifyError(cause=e) from e + + elif e.code == CurlECode.SSL_CONNECT_ERROR: + raise SSLError(cause=e) from e + + elif e.code == CurlECode.TOO_MANY_REDIRECTS: + max_redirects_exceeded = True + curl_response = e.response + + elif e.code == CurlECode.PROXY: + raise ProxyError(cause=e) from e + else: + raise 
TransportError(cause=e) from e + + response = CurlCFFIResponseAdapter(curl_response) + + if not 200 <= response.status < 300: + raise HTTPError(response, redirect_loop=max_redirects_exceeded) + + return response + + +@register_preference(CurlCFFIRH) +def curl_cffi_preference(rh, request): + return -100 diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py index 6545028c8143..e3edc77f3803 100644 --- a/yt_dlp/networking/_requests.py +++ b/yt_dlp/networking/_requests.py @@ -307,8 +307,7 @@ def _send(self, request): max_redirects_exceeded = False - session = self._get_instance( - cookiejar=request.extensions.get('cookiejar') or self.cookiejar) + session = self._get_instance(cookiejar=self._get_cookiejar(request)) try: requests_res = session.request( @@ -316,8 +315,8 @@ def _send(self, request): url=request.url, data=request.data, headers=headers, - timeout=float(request.extensions.get('timeout') or self.timeout), - proxies=request.proxies or self.proxies, + timeout=self._calculate_timeout(request), + proxies=self._get_proxies(request), allow_redirects=True, stream=True ) diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index cb4dae38168a..ff110dc29be9 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -389,11 +389,11 @@ def _send(self, request): ) opener = self._get_instance( - proxies=request.proxies or self.proxies, - cookiejar=request.extensions.get('cookiejar') or self.cookiejar + proxies=self._get_proxies(request), + cookiejar=self._get_cookiejar(request) ) try: - res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout)) + res = opener.open(urllib_req, timeout=self._calculate_timeout(request)) except urllib.error.HTTPError as e: if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)): # Prevent file object from being closed when urllib.error.HTTPError is destroyed. 
diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py index 159793204b12..6e235b0c624c 100644 --- a/yt_dlp/networking/_websockets.py +++ b/yt_dlp/networking/_websockets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import io import logging import ssl @@ -38,27 +39,40 @@ import websockets.sync.client from websockets.uri import parse_uri +# In websockets Connection, recv_exc and recv_events_exc are defined +# after the recv events handler thread is started [1]. +# On our CI using PyPy, in some cases a race condition may occur +# where the recv events handler thread tries to use these attributes before they are defined [2]. +# 1: https://github.com/python-websockets/websockets/blame/de768cf65e7e2b1a3b67854fb9e08816a5ff7050/src/websockets/sync/connection.py#L93 +# 2: "AttributeError: 'ClientConnection' object has no attribute 'recv_events_exc'. Did you mean: 'recv_events'?" +import websockets.sync.connection # isort: split +with contextlib.suppress(Exception): + # > 12.0 + websockets.sync.connection.Connection.recv_exc = None + # 12.0 + websockets.sync.connection.Connection.recv_events_exc = None + class WebsocketsResponseAdapter(WebSocketResponse): - def __init__(self, wsw: websockets.sync.client.ClientConnection, url): + def __init__(self, ws: websockets.sync.client.ClientConnection, url): super().__init__( - fp=io.BytesIO(wsw.response.body or b''), + fp=io.BytesIO(ws.response.body or b''), url=url, - headers=wsw.response.headers, - status=wsw.response.status_code, - reason=wsw.response.reason_phrase, + headers=ws.response.headers, + status=ws.response.status_code, + reason=ws.response.reason_phrase, ) - self.wsw = wsw + self._ws = ws def close(self): - self.wsw.close() + self._ws.close() super().close() def send(self, message): # https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.send try: - return self.wsw.send(message) + return self._ws.send(message) 
except (websockets.exceptions.WebSocketException, RuntimeError, TimeoutError) as e: raise TransportError(cause=e) from e except SocksProxyError as e: @@ -69,7 +83,7 @@ def send(self, message): def recv(self): # https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.recv try: - return self.wsw.recv() + return self._ws.recv() except SocksProxyError as e: raise ProxyError(cause=e) from e except (websockets.exceptions.WebSocketException, RuntimeError, TimeoutError) as e: @@ -112,10 +126,10 @@ def close(self): logging.getLogger(name).removeHandler(handler) def _send(self, request): - timeout = float(request.extensions.get('timeout') or self.timeout) + timeout = self._calculate_timeout(request) headers = self._merge_headers(request.headers) if 'cookie' not in headers: - cookiejar = request.extensions.get('cookiejar') or self.cookiejar + cookiejar = self._get_cookiejar(request) cookie_header = cookiejar.get_cookie_header(request.url) if cookie_header: headers['cookie'] = cookie_header @@ -125,7 +139,7 @@ def _send(self, request): 'source_address': (self.source_address, 0) if self.source_address else None, 'timeout': timeout } - proxy = select_proxy(request.url, request.proxies or self.proxies or {}) + proxy = select_proxy(request.url, self._get_proxies(request)) try: if proxy: socks_proxy_options = make_socks_proxy_opts(proxy) diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index 39442bae04b9..4c66ba66aaf3 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -256,6 +256,15 @@ def _make_sslcontext(self): def _merge_headers(self, request_headers): return HTTPHeaderDict(self.headers, request_headers) + def _calculate_timeout(self, request): + return float(request.extensions.get('timeout') or self.timeout) + + def _get_cookiejar(self, request): + return request.extensions.get('cookiejar') or self.cookiejar + + def _get_proxies(self, request): + return (request.proxies or 
self.proxies).copy() + def _check_url_scheme(self, request: Request): scheme = urllib.parse.urlparse(request.url).scheme.lower() if self._SUPPORTED_URL_SCHEMES is not None and scheme not in self._SUPPORTED_URL_SCHEMES: @@ -454,9 +463,10 @@ def headers(self, new_headers: Mapping): else: raise TypeError('headers must be a mapping') - def update(self, url=None, data=None, headers=None, query=None): + def update(self, url=None, data=None, headers=None, query=None, extensions=None): self.data = data if data is not None else self.data self.headers.update(headers or {}) + self.extensions.update(extensions or {}) self.url = update_url_query(url or self.url, query or {}) def copy(self): @@ -491,7 +501,7 @@ class Response(io.IOBase): def __init__( self, - fp: typing.IO, + fp: io.IOBase, url: str, headers: Mapping[str, str], status: int = 200, diff --git a/yt_dlp/networking/impersonate.py b/yt_dlp/networking/impersonate.py new file mode 100644 index 000000000000..ca66180c707d --- /dev/null +++ b/yt_dlp/networking/impersonate.py @@ -0,0 +1,141 @@ +from __future__ import annotations + +import re +from abc import ABC +from dataclasses import dataclass +from typing import Any + +from .common import RequestHandler, register_preference +from .exceptions import UnsupportedRequest +from ..compat.types import NoneType +from ..utils import classproperty, join_nonempty +from ..utils.networking import std_headers + + +@dataclass(order=True, frozen=True) +class ImpersonateTarget: + """ + A target for browser impersonation. + + Parameters: + @param client: the client to impersonate + @param version: the client version to impersonate + @param os: the client OS to impersonate + @param os_version: the client OS version to impersonate + + Note: None is used to indicate to match any. 
+ + """ + client: str | None = None + version: str | None = None + os: str | None = None + os_version: str | None = None + + def __post_init__(self): + if self.version and not self.client: + raise ValueError('client is required if version is set') + if self.os_version and not self.os: + raise ValueError('os is required if os_version is set') + + def __contains__(self, target: ImpersonateTarget): + if not isinstance(target, ImpersonateTarget): + return False + return ( + (self.client is None or target.client is None or self.client == target.client) + and (self.version is None or target.version is None or self.version == target.version) + and (self.os is None or target.os is None or self.os == target.os) + and (self.os_version is None or target.os_version is None or self.os_version == target.os_version) + ) + + def __str__(self): + return f'{join_nonempty(self.client, self.version)}:{join_nonempty(self.os, self.os_version)}'.rstrip(':') + + @classmethod + def from_str(cls, target: str): + mobj = re.fullmatch(r'(?:(?P<client>[^:-]+)(?:-(?P<version>[^:-]+))?)?(?::(?:(?P<os>[^:-]+)(?:-(?P<os_version>[^:-]+))?)?)?', target) + if not mobj: + raise ValueError(f'Invalid impersonate target "{target}"') + return cls(**mobj.groupdict()) + + +class ImpersonateRequestHandler(RequestHandler, ABC): + """ + Base class for request handlers that support browser impersonation. + + This provides a method for checking the validity of the impersonate extension, + which can be used in _check_extensions. + + Impersonate targets consist of a client, version, os and os_ver. + See the ImpersonateTarget class for more details. + + The following may be defined: + - `_SUPPORTED_IMPERSONATE_TARGET_MAP`: a dict mapping supported targets to custom object. + Any Request with an impersonate target not in this list will raise an UnsupportedRequest. + Set to None to disable this check. + Note: Entries are in order of preference + + Parameters: + @param impersonate: the default impersonate target to use for requests. 
+ Set to None to disable impersonation. + """ + _SUPPORTED_IMPERSONATE_TARGET_MAP: dict[ImpersonateTarget, Any] = {} + + def __init__(self, *, impersonate: ImpersonateTarget = None, **kwargs): + super().__init__(**kwargs) + self.impersonate = impersonate + + def _check_impersonate_target(self, target: ImpersonateTarget): + assert isinstance(target, (ImpersonateTarget, NoneType)) + if target is None or not self.supported_targets: + return + if not self.is_supported_target(target): + raise UnsupportedRequest(f'Unsupported impersonate target: {target}') + + def _check_extensions(self, extensions): + super()._check_extensions(extensions) + if 'impersonate' in extensions: + self._check_impersonate_target(extensions.get('impersonate')) + + def _validate(self, request): + super()._validate(request) + self._check_impersonate_target(self.impersonate) + + def _resolve_target(self, target: ImpersonateTarget | None): + """Resolve a target to a supported target.""" + if target is None: + return + for supported_target in self.supported_targets: + if target in supported_target: + if self.verbose: + self._logger.stdout( + f'{self.RH_NAME}: resolved impersonate target {target} to {supported_target}') + return supported_target + + @classproperty + def supported_targets(self) -> tuple[ImpersonateTarget, ...]: + return tuple(self._SUPPORTED_IMPERSONATE_TARGET_MAP.keys()) + + def is_supported_target(self, target: ImpersonateTarget): + assert isinstance(target, ImpersonateTarget) + return self._resolve_target(target) is not None + + def _get_request_target(self, request): + """Get the requested target for the request""" + return self._resolve_target(request.extensions.get('impersonate') or self.impersonate) + + def _get_impersonate_headers(self, request): + headers = self._merge_headers(request.headers) + if self._get_request_target(request) is not None: + # remove all headers present in std_headers + # todo: change this to not depend on std_headers + for k, v in std_headers.items(): + 
if headers.get(k) == v: + headers.pop(k) + return headers + + +@register_preference(ImpersonateRequestHandler) +def impersonate_preference(rh, request): + if request.extensions.get('impersonate') or rh.impersonate: + return 1000 + return 0 diff --git a/yt_dlp/options.py b/yt_dlp/options.py index f8847273128a..43d71ef070be 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -515,6 +515,18 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): metavar='IP', dest='source_address', default=None, help='Client-side IP address to bind to', ) + network.add_option( + '--impersonate', + metavar='CLIENT[:OS]', dest='impersonate', default=None, + help=( + 'Client to impersonate for requests. E.g. chrome, chrome-110, chrome:windows-10. ' + 'Pass --impersonate="" to impersonate any client.'), + ) + network.add_option( + '--list-impersonate-targets', + dest='list_impersonate_targets', default=False, action='store_true', + help='List available clients to impersonate.', + ) network.add_option( '-4', '--force-ipv4', action='store_const', const='0.0.0.0', dest='source_address', @@ -679,6 +691,10 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--break-on-existing', action='store_true', dest='break_on_existing', default=False, help='Stop the download process when encountering a file that is in the archive') + selection.add_option( + '--no-break-on-existing', + action='store_false', dest='break_on_existing', + help='Do not stop the download process when encountering a file that is in the archive (default)') selection.add_option( '--break-on-reject', action='store_true', dest='break_on_reject', default=False, diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 9efeb6a1c142..dec514674f5c 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5,7 +5,7 @@ import collections import collections.abc import contextlib -import datetime +import datetime as dt import email.header import email.utils import errno @@ -1150,14 
+1150,14 @@ def extract_timezone(date_str): timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip()) if timezone is not None: date_str = date_str[:-len(m.group('tz'))] - timezone = datetime.timedelta(hours=timezone or 0) + timezone = dt.timedelta(hours=timezone or 0) else: date_str = date_str[:-len(m.group('tz'))] if not m.group('sign'): - timezone = datetime.timedelta() + timezone = dt.timedelta() else: sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( + timezone = dt.timedelta( hours=sign * int(m.group('hours')), minutes=sign * int(m.group('minutes'))) return timezone, date_str @@ -1176,8 +1176,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): with contextlib.suppress(ValueError): date_format = f'%Y-%m-%d{delimiter}%H:%M:%S' - dt = datetime.datetime.strptime(date_str, date_format) - timezone - return calendar.timegm(dt.timetuple()) + dt_ = dt.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt_.timetuple()) def date_formats(day_first=True): @@ -1198,12 +1198,12 @@ def unified_strdate(date_str, day_first=True): for expression in date_formats(day_first): with contextlib.suppress(ValueError): - upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') + upload_date = dt.datetime.strptime(date_str, expression).strftime('%Y%m%d') if upload_date is None: timetuple = email.utils.parsedate_tz(date_str) if timetuple: with contextlib.suppress(ValueError): - upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') + upload_date = dt.datetime(*timetuple[:6]).strftime('%Y%m%d') if upload_date is not None: return str(upload_date) @@ -1233,8 +1233,8 @@ def unified_timestamp(date_str, day_first=True): for expression in date_formats(day_first): with contextlib.suppress(ValueError): - dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) - return calendar.timegm(dt.timetuple()) + dt_ = dt.datetime.strptime(date_str, expression) - 
timezone + dt.timedelta(hours=pm_delta) + return calendar.timegm(dt_.timetuple()) timetuple = email.utils.parsedate_tz(date_str) if timetuple: @@ -1272,11 +1272,11 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): if precision == 'auto': auto_precision = True precision = 'microsecond' - today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision) + today = datetime_round(dt.datetime.now(dt.timezone.utc), precision) if date_str in ('now', 'today'): return today if date_str == 'yesterday': - return today - datetime.timedelta(days=1) + return today - dt.timedelta(days=1) match = re.match( r'(?P.+)(?P[+-])(?P