From 95d4af6671140a036395c5a741b9b58a176ea04c Mon Sep 17 00:00:00 2001 From: Brian Le Date: Sun, 8 Sep 2024 16:34:53 -0700 Subject: [PATCH 1/4] Support POT generation via script --- .../yt_dlp_plugins/extractor/getpot_bgutil.py | 38 ++++++++++++++++++- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/plugin/yt_dlp_plugins/extractor/getpot_bgutil.py b/plugin/yt_dlp_plugins/extractor/getpot_bgutil.py index f2f887c..d161a9e 100644 --- a/plugin/yt_dlp_plugins/extractor/getpot_bgutil.py +++ b/plugin/yt_dlp_plugins/extractor/getpot_bgutil.py @@ -1,4 +1,5 @@ import json +import subprocess from yt_dlp import YoutubeDL from yt_dlp.networking.common import Request @@ -16,6 +17,18 @@ def _validate_get_pot(self, client: str, ydl: YoutubeDL, visitor_data=None, data raise UnsupportedRequest('One of [data_sync_id, visitor_data] must be passed') def _get_pot(self, client: str, ydl: YoutubeDL, visitor_data=None, data_sync_id=None, player_url=None, **kwargs) -> str: + generate_pot_script_path = ydl.get_info_extractor('Youtube')._configuration_arg('getpot_bgutil_script', [None], casesense=True)[0] + if generate_pot_script_path: + self._logger.info(f"Generating POT via script: {generate_pot_script_path}") + po_token = self._get_pot_via_script(generate_pot_script_path, visitor_data, data_sync_id) + return po_token + else: + self._logger.info(f"Generating POT via HTTP server") + po_token = self._get_pot_via_http(ydl, client, visitor_data, data_sync_id) + + return po_token + + def _get_pot_via_http(self, ydl, client, visitor_data, data_sync_id): response = ydl.urlopen(Request('http://127.0.0.1:4416/get_pot', data=json.dumps({ 'client': client, 'visitor_data': visitor_data, @@ -27,5 +40,26 @@ def _get_pot(self, client: str, ydl: YoutubeDL, visitor_data=None, data_sync_id= if 'po_token' not in response_json: raise RequestError('Server did not respond with a po_token') - self._logger.debug(f'Got PO Token: {response_json["po_token"]}') - return response_json['po_token'] + return 
response_json["po_token"] + + def _get_pot_via_script(self, script_path, visitor_data, data_sync_id): + # possibly vulnerable to shell injection here? but risk is low + command_args = ['node', script_path] + if data_sync_id: + command_args.extend(["-d", data_sync_id]) + elif visitor_data: + command_args.extend(["-v", visitor_data]) + else: + raise RequestError("Unexpected missing visitorData/dataSyncId in _get_pot_via_script") + self._logger.debug(f"Executing command to get POT via script: {' '.join(command_args)}") + + result = subprocess.run(command_args,stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + self._logger.debug(f"stdout = {result.stdout}") + if result.stderr or result.returncode != 0: + raise RequestError(f"_get_pot_via_script failed with return code {result.returncode}. stderr = {result.stderr}") + + script_data_resp = result.stdout.splitlines()[-1] + self._logger.debug(f"_get_pot_via_script response = {script_data_resp}") + response = json.loads(script_data_resp) + return response['poToken'] From bc2ed145c7b15a76d06ea11755961a932c2d5e61 Mon Sep 17 00:00:00 2001 From: Brian Le Date: Sun, 8 Sep 2024 16:53:02 -0700 Subject: [PATCH 2/4] Update README --- README.md | 67 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 2690bb9..9971ef8 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,9 @@ This is used to bypass the 'Sign in to confirm you're not a bot' message when in The provider comes in two parts: -1. **Provider**: An HTTP server that generates the POT, and has interfaces for the plugin to retrieve data from +1. **Provider**: Two options - + - An HTTP server that generates the POT, and has interfaces for the plugin to retrieve data from (easy setup + docker image provided) + - A POT generation script supplied via extractor arguments 2. 
**Provider plugin**: uses POT plugin framework to retrieve data from the provider, allowing yt-dlp to simulate having passed the 'bot check' ## Installation @@ -21,17 +23,27 @@ The provider comes in two parts: Default port number is 4416. If you want to change this, be sure to change it in both the provider and plugin code. +### Base Requirements + +If using Docker image for option (a) for the provider, the Docker runtime is required. + +Otherwise, Node.js and Yarn are required. You will also need to clone the repository. + ### 1. Set up the provider +There are two options for the provider, an always running POT generation HTTP server, and a POT generation script invoked when needed. The HTTP server option is simpler, and comes with a prebuilt Docker image. **You only need to choose one option.** + +#### (a) HTTP Server Option + The provider is a Node.js HTTP server. You have two options of running it: as a prebuilt docker image, or manually as a node application. -#### Docker: +**Docker:** ```shell docker run --name bgutil-provider -d -p 4416:4416 brainicism/bgutil-ytdlp-pot-provider ``` -#### Native: +**Native:** ```shell cd server/ @@ -40,6 +52,31 @@ npx tsc node build/main.js ``` +
+ Server Endpoints/Environment Variables + +**Environment Variables** + +- **TOKEN_TTL**: The time in hours for a PO token to be considered valid. While there are no definitive answers on how long a token is valid, it has been observed to be valid for at least a couple of days. Default: 6 + +**Endpoints** + +- **POST /get_pot**: Accepts a `visitor_data` (unauthenticated) or a `data_sync_id` (authenticated) in the request body; the body may also be empty. If no identifier is passed, a new unauthenticated `visitor_data` will be generated. Returns `po_token` and the associated identifier `visit_identifier`. +- **POST /invalidate_caches**: Resets the PO token cache, forcing new tokens to be generated on next fetch. +
+ +#### (b) Generation Script Option + +The generation script needs to be transpiled to Javascript before it can be used by the plugin. + +```shell +cd server/ +yarn install --frozen-lockfile +npx tsc +``` + +Make sure `node` is available in your `PATH`. + ### 2. Install the plugin #### PyPI: @@ -57,28 +94,10 @@ This will automatically install [coletdjnz's POT plugin framework](https://githu ## Usage -### Environment Variables - -- **TOKEN_TTL**: The time in hours for a PO token to be considered valid. While there are no definitive answers on how long a token is valid, it has been observed to be valid for atleast a couple of days. Default: 6 - -### Endpoints - -- **POST /get_pot**: Accepts a `visitor_data` (unauthenticated), `data_sync_id` (authenticated) or an empty body in the request body. If no identifier is passed, a new unauthenticated `visitor_data` will be generated. Returns `po_token` and the associated identifier `visit_identifier`. -- **POST /invalidate_caches**: Resets the PO token cache, forcing new tokens to be generated on next fetch - -### Server-less +If using option (a) HTTP Server for the provider, use yt-dlp like normal 🙂. -If you don't need to programatically generate POTs, you can use the `generate_once` script to generate a POT. +If using option (b) script for the provider, you need to pass extractor arguments including the path to the generation script for each yt-dlp call. 
Make sure to point to the transpiled version, `server/build/generate_once.js` ```shell -cd server/ -yarn install --frozen-lockfile -npx tsc -node build/generate_once.js ## can pass "-v [visitor_data]" or "-d [data_sync_id]" if needed as well -``` - -Output: - -``` -{"visitIdentifier":"C*****************************************%3D%3D","poToken":"M******************************************************************************************************************************************************************=","generatedAt":"2024-09-08T02:57:52.283Z"} +./yt-dlp --extractor-args "youtube:getpot_bgutil_script=/home/user/bgutil-test/bgutil-ytdlp-pot-provider/server/build/generate_once.js" ``` From e0068e3082500ff30e8776f28062f73f9a2c8cb2 Mon Sep 17 00:00:00 2001 From: Brian Le Date: Sun, 8 Sep 2024 16:55:40 -0700 Subject: [PATCH 3/4] minor changes --- plugin/yt_dlp_plugins/extractor/getpot_bgutil.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugin/yt_dlp_plugins/extractor/getpot_bgutil.py b/plugin/yt_dlp_plugins/extractor/getpot_bgutil.py index d161a9e..14c76af 100644 --- a/plugin/yt_dlp_plugins/extractor/getpot_bgutil.py +++ b/plugin/yt_dlp_plugins/extractor/getpot_bgutil.py @@ -59,7 +59,7 @@ def _get_pot_via_script(self, script_path, visitor_data, data_sync_id): if result.stderr or result.returncode != 0: raise RequestError(f"_get_pot_via_script failed with return code {result.returncode}. 
stderr = {result.stderr}") + # the JSON response is always the last line script_data_resp = result.stdout.splitlines()[-1] self._logger.debug(f"_get_pot_via_script response = {script_data_resp}") - response = json.loads(script_data_resp) - return response['poToken'] + return json.loads(script_data_resp)['poToken'] From 2ce4ec5e4212260bbb70ee0f6c9384dec207a8f3 Mon Sep 17 00:00:00 2001 From: Brian Le Date: Sun, 8 Sep 2024 17:26:37 -0700 Subject: [PATCH 4/4] Update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9971ef8..2ffd524 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,8 @@ This is used to bypass the 'Sign in to confirm you're not a bot' message when in The provider comes in two parts: 1. **Provider**: Two options - - - An HTTP server that generates the POT, and has interfaces for the plugin to retrieve data from (easy setup + docker image provided) - - A POT generation script supplied via extractor arguments + - (a) An HTTP server that generates the POT, and has interfaces for the plugin to retrieve data from (easy setup + docker image provided) + - (b) A POT generation script supplied via extractor arguments 2. **Provider plugin**: uses POT plugin framework to retrieve data from the provider, allowing yt-dlp to simulate having passed the 'bot check' ## Installation