From 66f90e9aca486e0c709aeb44efc3f36b2b8a0016 Mon Sep 17 00:00:00 2001 From: Christopher Hertel Date: Sat, 21 Sep 2024 23:26:40 +0200 Subject: [PATCH] feat: adding youtube transcription tool --- README.md | 13 ++ composer.json | 6 +- composer.lock | 201 +++++++++++++++++++++++- examples/toolchain-youtube.php | 28 ++++ src/ToolBox/Tool/YouTubeTranscriber.php | 67 ++++++++ 5 files changed, 313 insertions(+), 2 deletions(-) create mode 100755 examples/toolchain-youtube.php create mode 100644 src/ToolBox/Tool/YouTubeTranscriber.php diff --git a/README.md b/README.md index 0a353e7..c9b101a 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ Provided Tools * [x] Clock * [x] Wikipedia * [x] Weather +* [x] YouTube Transcriber Usage Examples -------------- @@ -105,6 +106,18 @@ Depending on the example you need to export needed environment variables for API php examples/toolchain-serpapi.php ``` +1. Weather Tool + ```bash + export OPENAI_API_KEY=sk-... + php examples/toolchain-weather.php + ``` + +1. YouTube Transcriber Tool + ```bash + export OPENAI_API_KEY=sk-... + php examples/toolchain-youtube.php + ``` + ### Structured Output 1. Structured Output Example: OpenAI's GPT diff --git a/composer.json b/composer.json index aa4eb17..0b18e6b 100644 --- a/composer.json +++ b/composer.json @@ -29,12 +29,16 @@ "probots-io/pinecone-php": "^1.0", "symfony/clock": "^6.4 || ^7.1", "symfony/console": "^6.4 || ^7.1", + "symfony/css-selector": "^6.4 || ^7.1", + "symfony/dom-crawler": "^6.4 || ^7.1", "symfony/var-dumper": "^6.4 || ^7.1" }, "suggest": { "codewithkyrian/chromadb-php": "For using the ChromaDB as retrieval vector store.", "probots-io/pinecone-php": "For using the Pinecone as retrieval vector store.", - "symfony/clock": "For using the clock tool." + "symfony/clock": "For using the clock tool.", + "symfony/css-selector": "For using the YouTube transcription tool.", + "symfony/dom-crawler": "For using the YouTube transcription tool." }, "config": { "sort-packages": true diff --git a/composer.lock b/composer.lock index c6e68e2..b08c2fd 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "71fd4420733a7abfe8b647efc5bac701", + "content-hash": "317e154b61bb699eae1c61f5bac6038d", "packages": [ { "name": "doctrine/deprecations", @@ -2090,6 +2090,73 @@ ], "time": "2024-07-18T11:15:46+00:00" }, + { + "name": "masterminds/html5", + "version": "2.9.0", + "source": { + "type": "git", + "url": "https://github.com/Masterminds/html5-php.git", + "reference": "f5ac2c0b0a2eefca70b2ce32a5809992227e75a6" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Masterminds/html5-php/zipball/f5ac2c0b0a2eefca70b2ce32a5809992227e75a6", + "reference": "f5ac2c0b0a2eefca70b2ce32a5809992227e75a6", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "php": ">=5.3.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.7-dev" + } + }, + "autoload": { + "psr-4": { + "Masterminds\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Matt Butcher", + "email": "technosophos@gmail.com" + }, + { + "name": "Matt Farina", + "email": "matt@mattfarina.com" + }, + { + "name": "Asmir Mustafic", + "email": "goetas@gmail.com" + } + ], + "description": "An HTML5 parser and serializer.", + "homepage": "http://masterminds.github.io/html5-php", + "keywords": [ + "HTML5", + "dom", + "html", + "parser", + "querypath", + "serializer", + "xml" + ], + "support": { + "issues": "https://github.com/Masterminds/html5-php/issues", + "source": "https://github.com/Masterminds/html5-php/tree/2.9.0" + }, + "time": "2024-03-31T07:05:07+00:00" + }, { "name": "myclabs/deep-copy", "version": "1.12.0", @@ -4340,6 +4407,138 @@ ], "time": "2024-09-20T08:28:38+00:00" }, + { + "name": "symfony/css-selector", + "version": "v7.1.1", + "source": { + "type": "git", + "url": "https://github.com/symfony/css-selector.git", + "reference": "1c7cee86c6f812896af54434f8ce29c8d94f9ff4" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/css-selector/zipball/1c7cee86c6f812896af54434f8ce29c8d94f9ff4", + "reference": "1c7cee86c6f812896af54434f8ce29c8d94f9ff4", + "shasum": "" + }, + "require": { + "php": ">=8.2" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\CssSelector\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Jean-François Simon", + "email": "jeanfrancois.simon@sensiolabs.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Converts CSS selectors to XPath expressions", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/css-selector/tree/v7.1.1" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2024-05-31T14:57:53+00:00" + }, + { + "name": "symfony/dom-crawler", + "version": "v7.1.5", + "source": { + "type": "git", + "url": "https://github.com/symfony/dom-crawler.git", + "reference": "b92af238457a7cdd2738f941cd525d76313e8283" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/dom-crawler/zipball/b92af238457a7cdd2738f941cd525d76313e8283", + "reference": "b92af238457a7cdd2738f941cd525d76313e8283", + "shasum": "" + }, + "require": { + "masterminds/html5": "^2.6", + "php": ">=8.2", + "symfony/polyfill-ctype": "~1.8", + "symfony/polyfill-mbstring": "~1.0" + }, + "require-dev": { + "symfony/css-selector": "^6.4|^7.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\DomCrawler\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Eases DOM navigation for HTML and XML documents", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/dom-crawler/tree/v7.1.5" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2024-09-15T06:48:17+00:00" + }, { "name": "symfony/polyfill-php83", "version": "v1.31.0", diff --git a/examples/toolchain-youtube.php b/examples/toolchain-youtube.php new file mode 100755 index 0000000..28c5ab9 --- /dev/null +++ b/examples/toolchain-youtube.php @@ -0,0 +1,28 @@ +call($messages); + +echo $response.PHP_EOL; diff --git a/src/ToolBox/Tool/YouTubeTranscriber.php b/src/ToolBox/Tool/YouTubeTranscriber.php new file mode 100644 index 0000000..60abce6 --- /dev/null +++ b/src/ToolBox/Tool/YouTubeTranscriber.php @@ -0,0 +1,67 @@ +client->request('GET', 'https://youtube.com/watch?v='.$videoId); + $html = $htmlResponse->getContent(); + + // Use DomCrawler to parse the HTML + $crawler = new Crawler($html); + + // Extract the script containing the ytInitialPlayerResponse + $scriptContent = $crawler->filter('script')->reduce(function (Crawler $node) { + return str_contains($node->text(), 'var ytInitialPlayerResponse = {'); + })->text(); + + // Extract and parse the JSON data from the script + $start = strpos($scriptContent, 'var ytInitialPlayerResponse = ') + strlen('var ytInitialPlayerResponse = '); + $dataString = substr($scriptContent, $start); + $dataString = substr($dataString, 0, strrpos($dataString, ';') ?: null); + $data = json_decode(trim($dataString), true); + + // Extract the URL for the captions + if (!isset($data['captions']['playerCaptionsTracklistRenderer']['captionTracks'][0]['baseUrl'])) { + throw new \Exception('Captions are not available for this video.'); + } + $captionsUrl = $data['captions']['playerCaptionsTracklistRenderer']['captionTracks'][0]['baseUrl']; + + // Fetch and parse the captions XML + $xmlResponse = $this->client->request('GET', $captionsUrl); + $xmlContent = $xmlResponse->getContent(); + $xmlCrawler = new Crawler($xmlContent); + + // Collect all text elements from the captions + $transcript = $xmlCrawler->filter('text')->each(function (Crawler $node) { + return $node->text().' '; + }); + + return implode(PHP_EOL, $transcript); + } +}