Skip to content

Commit

Permalink
feat: adding youtube transcription tool
Browse files Browse the repository at this point in the history
  • Loading branch information
chr-hertel committed Sep 21, 2024
1 parent ea023f8 commit 66f90e9
Show file tree
Hide file tree
Showing 5 changed files with 313 additions and 2 deletions.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ Provided Tools
* [x] Clock
* [x] Wikipedia
* [x] Weather
* [x] YouTube Transcriber

Usage Examples
--------------
Expand Down Expand Up @@ -105,6 +106,18 @@ Depending on the example you need to export needed environment variables for API
php examples/toolchain-serpapi.php
```

1. Weather Tool
```bash
export OPENAI_API_KEY=sk-...
php examples/toolchain-weather.php
```

1. YouTube Transcriber Tool
```bash
export OPENAI_API_KEY=sk-...
php examples/toolchain-youtube.php
```

### Structured Output

1. Structured Output Example: OpenAI's GPT
Expand Down
6 changes: 5 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,16 @@
"probots-io/pinecone-php": "^1.0",
"symfony/clock": "^6.4 || ^7.1",
"symfony/console": "^6.4 || ^7.1",
"symfony/css-selector": "^6.4 || ^7.1",
"symfony/dom-crawler": "^6.4 || ^7.1",
"symfony/var-dumper": "^6.4 || ^7.1"
},
"suggest": {
"codewithkyrian/chromadb-php": "For using the ChromaDB as retrieval vector store.",
"probots-io/pinecone-php": "For using the Pinecone as retrieval vector store.",
"symfony/clock": "For using the clock tool."
"symfony/clock": "For using the clock tool.",
"symfony/css-selector": "For using the YouTube transcription tool.",
"symfony/dom-crawler": "For using the YouTube transcription tool."
},
"config": {
"sort-packages": true
Expand Down
201 changes: 200 additions & 1 deletion composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions examples/toolchain-youtube.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?php

use PhpLlm\LlmChain\Chain;
use PhpLlm\LlmChain\Message\Message;
use PhpLlm\LlmChain\Message\MessageBag;
use PhpLlm\LlmChain\OpenAI\Model\Gpt;
use PhpLlm\LlmChain\OpenAI\Model\Gpt\Version;
use PhpLlm\LlmChain\OpenAI\Runtime\OpenAI;
use PhpLlm\LlmChain\ToolBox\ParameterAnalyzer;
use PhpLlm\LlmChain\ToolBox\Registry;
use PhpLlm\LlmChain\ToolBox\Tool\YouTubeTranscriber;
use PhpLlm\LlmChain\ToolBox\ToolAnalyzer;
use Symfony\Component\HttpClient\HttpClient;

// Load Composer's autoloader from the project root.
require_once dirname(__DIR__).'/vendor/autoload.php';

// A single HTTP client is shared by the OpenAI runtime and the transcriber tool.
$client = HttpClient::create();

// GPT-4o mini model, authenticated with the key read from OPENAI_API_KEY.
$model = new Gpt(new OpenAI($client, getenv('OPENAI_API_KEY')), Version::GPT_4o_MINI);

// Register the YouTube transcriber as the only tool and wire it into the chain.
$youTubeTool = new YouTubeTranscriber($client);
$toolAnalyzer = new ToolAnalyzer(new ParameterAnalyzer());
$tools = new Registry($toolAnalyzer, [$youTubeTool]);
$chain = new Chain($model, $tools);

// Ask for a summary of the video and print the chain's reply.
$prompt = Message::ofUser('Please summarize this video for me: https://www.youtube.com/watch?v=6uXW-ulpj0s');

echo $chain->call(new MessageBag($prompt)).PHP_EOL;
67 changes: 67 additions & 0 deletions src/ToolBox/Tool/YouTubeTranscriber.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\ToolBox\Tool;

use PhpLlm\LlmChain\ToolBox\AsTool;
use Symfony\Component\CssSelector\CssSelectorConverter;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;

/**
 * Tool that returns the transcript of a YouTube video.
 *
 * It downloads the video's watch page, reads the caption track URL out of the
 * `ytInitialPlayerResponse` JSON embedded in one of the page's script tags,
 * downloads the first caption track and returns the text of its `text` elements.
 */
#[AsTool('youtube_transcript', 'Fetches the transcript of a YouTube video')]
final readonly class YouTubeTranscriber
{
    /** JavaScript assignment that precedes the embedded player response JSON. */
    private const PLAYER_RESPONSE_MARKER = 'var ytInitialPlayerResponse = ';

    /**
     * @param HttpClientInterface $client HTTP client used for both the watch page and the captions request
     *
     * @throws \LogicException when the DomCrawler or CssSelector component is not installed
     */
    public function __construct(
        private HttpClientInterface $client,
    ) {
        if (!class_exists(Crawler::class)) {
            throw new \LogicException('The Symfony DomCrawler component is required to use this tool.');
        }
        if (!class_exists(CssSelectorConverter::class)) {
            throw new \LogicException('The Symfony CSS Selector component is required to use this tool.');
        }
    }

    /**
     * @param string $videoId The ID of the YouTube video
     *
     * @return string the caption texts, each followed by a space, joined with PHP_EOL
     *
     * @throws \RuntimeException when the player response cannot be found or parsed,
     *                           or when the video exposes no caption track
     */
    public function __invoke(string $videoId): string
    {
        $playerResponse = $this->fetchPlayerResponse($videoId);

        // Only the first caption track is used.
        $captionsUrl = $playerResponse['captions']['playerCaptionsTracklistRenderer']['captionTracks'][0]['baseUrl'] ?? null;
        if (!is_string($captionsUrl) || '' === $captionsUrl) {
            throw new \RuntimeException('Captions are not available for this video.');
        }

        // Fetch the captions XML and collect the content of every <text> element.
        $captionsXml = $this->client->request('GET', $captionsUrl)->getContent();
        $lines = (new Crawler($captionsXml))
            ->filter('text')
            ->each(static fn (Crawler $node): string => $node->text().' ');

        return implode(PHP_EOL, $lines);
    }

    /**
     * Downloads the watch page and decodes the embedded player response.
     *
     * @return array<string, mixed>
     *
     * @throws \RuntimeException when the script is missing or its JSON payload is invalid
     */
    private function fetchPlayerResponse(string $videoId): array
    {
        // The ID is supplied by the model, so escape it instead of trusting it to be URL-safe.
        $watchUrl = 'https://youtube.com/watch?v='.rawurlencode($videoId);
        $html = $this->client->request('GET', $watchUrl)->getContent();

        // Keep only the script tag(s) that assign the player response object.
        $scripts = (new Crawler($html))
            ->filter('script')
            ->reduce(static fn (Crawler $node): bool => str_contains($node->text(), self::PLAYER_RESPONSE_MARKER.'{'));

        // Fail with an explicit message rather than reading text from an empty node list.
        if (0 === $scripts->count()) {
            throw new \RuntimeException('Could not find the player response on the YouTube video page.');
        }
        $script = $scripts->text();

        // Cut everything before the assignment and everything from the last semicolon on.
        $offset = strpos($script, self::PLAYER_RESPONSE_MARKER) + strlen(self::PLAYER_RESPONSE_MARKER);
        $json = substr($script, $offset);
        $json = substr($json, 0, strrpos($json, ';') ?: null);

        try {
            $data = json_decode(trim($json), true, 512, \JSON_THROW_ON_ERROR);
        } catch (\JsonException $exception) {
            throw new \RuntimeException('Could not parse the player response of the YouTube video page.', 0, $exception);
        }

        if (!is_array($data)) {
            throw new \RuntimeException('Could not parse the player response of the YouTube video page.');
        }

        return $data;
    }
}

0 comments on commit 66f90e9

Please sign in to comment.