Skip to content
This repository has been archived by the owner on Jul 18, 2024. It is now read-only.

Commit

Permalink
Merge pull request #520 from kyotoyx/youtube_transcript
Browse files Browse the repository at this point in the history
[v1.3][ISSUE-519] YoutubeLoader optimization using transcript api and pytube.
  • Loading branch information
xuechendi authored Jan 9, 2024
2 parents 93239e5 + 5fa6318 commit 498cb37
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 29 deletions.
62 changes: 50 additions & 12 deletions RecDP/pyrecdp/primitives/document/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,17 +412,50 @@ def read_from_langchain(loader: str, loader_kwargs: Optional[dict[str, Any]] = N
return [{'text': doc.page_content, 'metadata': doc.metadata} for doc in loader.load()]


def read_youtube_audio(url: Union[str, List[str]], save_dir: Optional[str] = None, model_name: Optional[str] = None):
import os
import tempfile
import shutil
def transcribe_youtube_video(url: Union[str, List[str]], save_dir: Optional[str] = None, model_name: Optional[str] = None):
"""Load from YouTube transcript (if available) or audio.
Args:
url (Union[str, List[str]]): YouTube URL or list of URLs.
save_dir (Optional[str]): Directory to save the audio files.
model_name (Optional[str]): Model name for the transcription service.
Returns:
List[Dict]: List of documents with text and metadata.
"""
urls = [url] if isinstance(url, str) else url
use_temp_dir = False
if save_dir is None or not os.path.isdir(save_dir):
use_temp_dir = True

# Try to load transcript from YouTube
from langchain_community.document_loaders import YoutubeLoader
pending_urls = []
doc_list = []

# Try to load transcript from YouTube
for url in urls:
try:
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
docs = loader.load()
if not docs:
logger.info(f'No transcript available for URL: {url}. Defaulting to audio transcription.')
pending_urls.append(url)
continue
for doc in docs:
doc_list.append({'text': doc.page_content, 'metadata': doc.metadata})
except Exception as e:
logger.warning(f'Warning: Failed to load transcript from URL {url}. This may be due to language mismatch. Defaulting to audio transcription.')
pending_urls.append(url)
if len(pending_urls) == 0:
return doc_list

# If transcripts are not available, use Whisper for audio to text conversion
import tempfile
import shutil
use_temp_dir = save_dir is None
if use_temp_dir:
save_dir = tempfile.mkdtemp()
docs = []
else:
if not os.path.exists(save_dir):
os.makedirs(save_dir)
try:
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
loader = YoutubeAudioLoader(urls, save_dir)
Expand All @@ -431,11 +464,16 @@ def read_youtube_audio(url: Union[str, List[str]], save_dir: Optional[str] = Non
audio_paths[url] = str(blob.path)
import whisper
model = whisper.load_model(model_name)
for url, audio_path in audio_paths.items():
result = model.transcribe(audio_path)
docs.append({'text': result['text'], 'metadata': {"source": url, 'language': result['language']}})
from pytube import Youtube
audio_paths = {}
for url in urls:
video = YouTube(url)
audio = video.streams.filter(only_audio=True, file_extension='mp4').first()
audio_path = audio.download(output_path=save_dir)
transcribe_result = model.transcribe(audio_path)
doc_list.append({'text': transcribe_result['text'], 'metadata': {'source': url, 'language': transcribe_result['language']}})
finally:
if use_temp_dir:
shutil.rmtree(save_dir)

return docs
return doc_list
10 changes: 5 additions & 5 deletions RecDP/pyrecdp/primitives/operations/doc_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,13 +198,13 @@ def __init__(self, urls: List[str], save_dir: str = None, model='small',
self.model_name = model
self.num_cpus = num_cpus
os.system("apt-get -qq -y install ffmpeg")
check_availability_and_install(['langchain', 'yt_dlp', 'openai-whisper'])
check_availability_and_install(['langchain', 'pytube', 'openai-whisper', 'youtube-transcript-api'])

def process_rayds(self, ds=None):
import ray
url_ds = ray.data.from_items([{'url': url} for url in self.urls])
from pyrecdp.primitives.document.reader import read_youtube_audio
self.cache = url_ds.flat_map(lambda record: read_youtube_audio(record['url'], self.save_dir, self.model_name),
from pyrecdp.primitives.document.reader import transcribe_youtube_video
self.cache = url_ds.flat_map(lambda record: transcribe_youtube_video(record['url'], self.save_dir, self.model_name),
num_cpus=self.num_cpus)
if ds is not None:
self.cache = self.union_ray_ds(ds, self.cache)
Expand All @@ -223,9 +223,9 @@ def process_spark(self, spark, spark_df=None):
]))
])

from pyrecdp.primitives.document.reader import read_youtube_audio
from pyrecdp.primitives.document.reader import transcribe_youtube_video
docs_rdd = urls_df.rdd.flatMap(
lambda row: read_youtube_audio(row['value'], self.save_dir, self.model_name))
lambda row: transcribe_youtube_video(row['value'], self.save_dir, self.model_name))

self.cache = spark.createDataFrame(docs_rdd, schema)
if spark_df is not None:
Expand Down
24 changes: 12 additions & 12 deletions RecDP/tests/test_llmutils_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,6 @@ class Test_LLMUtils_Operations(unittest.TestCase):
def setUp(self):
    """Print a banner naming the test method that is about to run."""
    separator = "******"
    # Leading newline keeps the banner visually apart from prior test output.
    print(f"\n{separator}\nTesting Method Name: {self._testMethodName}\n{separator}")

### ====== Priority execution ====== ###
def a_test_youtube_load_spark(self):
    """Build a YoutubeLoader for two video URLs and show its Spark output."""
    video_urls = [
        "https://www.youtube.com/watch?v=J31r79uUi9M",
        "https://www.youtube.com/watch?v=w9kq1BjqrfE",
    ]
    loader_op = YoutubeLoader(video_urls)
    with SparkContext("tests/data/llm_data/tiny_c4_sample.jsonl") as spark_ctx:
        # No input dataframe is passed: only the loader's own output is shown.
        loaded = loader_op.process_spark(spark_ctx.spark)
        spark_ctx.show(loaded)

def a_test_youtube_load_ray(self):
    """Build a YoutubeLoader for two video URLs and show its Ray output."""
    video_urls = [
        "https://www.youtube.com/watch?v=J31r79uUi9M",
        "https://www.youtube.com/watch?v=w9kq1BjqrfE",
    ]
    loader_op = YoutubeLoader(video_urls)
    with RayContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ray_ctx:
        # process_rayds is called without a dataset, so only loader output is shown.
        loaded = loader_op.process_rayds()
        ray_ctx.show(loaded)
### ====== Ray ====== ###

def test_bytesize_ray(self):
Expand Down Expand Up @@ -321,6 +309,12 @@ def test_document_embed_faiss_ray(self):
)
with RayContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ctx:
ctx.show(op.process_rayds(ctx.ds))

def test_youtube_load_ray(self):
    """Build a YoutubeLoader for two video URLs and show its Ray output."""
    video_urls = [
        "https://www.youtube.com/watch?v=J31r79uUi9M",
        "https://www.youtube.com/watch?v=w9kq1BjqrfE",
    ]
    loader_op = YoutubeLoader(video_urls)
    with RayContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ray_ctx:
        # process_rayds is called without a dataset, so only loader output is shown.
        loaded = loader_op.process_rayds()
        ray_ctx.show(loaded)

### ====== Spark ====== ###

Expand Down Expand Up @@ -564,3 +558,9 @@ def test_document_loader_spark(self):
op = DocumentLoader(loader='RecursiveUrlLoader', loader_args={'url': url})
with SparkContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ctx:
ctx.show(op.process_spark(ctx.spark))

def test_youtube_load_spark(self):
    """Build a YoutubeLoader for two video URLs and show its Spark output."""
    video_urls = [
        "https://www.youtube.com/watch?v=J31r79uUi9M",
        "https://www.youtube.com/watch?v=w9kq1BjqrfE",
    ]
    loader_op = YoutubeLoader(video_urls)
    with SparkContext("tests/data/llm_data/tiny_c4_sample.jsonl") as spark_ctx:
        # No input dataframe is passed: only the loader's own output is shown.
        loaded = loader_op.process_spark(spark_ctx.spark)
        spark_ctx.show(loaded)

0 comments on commit 498cb37

Please sign in to comment.