Skip to content

Commit

Permalink
Update channel id for Sheinbaum, use logging instead of print
Browse files Browse the repository at this point in the history
  • Loading branch information
ivansabik committed Oct 18, 2024
1 parent cf3f990 commit 92d3445
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 22 deletions.
45 changes: 31 additions & 14 deletions process.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
import datetime
import json
import os
import xml

import requests
import scrapetube
from aws_lambda_powertools import Logger
from timelength import TimeLength
from youtube_transcript_api import YouTubeTranscriptApi, _errors

AMLO_CHANNEL_ID = "UCxEgOKuI-n-WOJaNcisHvSg"
SHEINBAUM_CHANNEL_ID = "UCvzHrtf9by1-UY67SfZse8w"

def handler():
logger = Logger()


def handler(channel_id):
# Get a list of all playlists in the channel
api_key = os.environ["YOUTUBE_V3_API_KEY"]
channel_id = "UCxEgOKuI-n-WOJaNcisHvSg"
playlists = requests.get(
"https://www.googleapis.com/youtube/v3/playlists",
params={
Expand Down Expand Up @@ -47,27 +53,32 @@ def _process_video(video_metadata):
video_id = video_metadata["videoId"]

# Check if video has already been processed
processed_local_path = f"data/{video_id}.json"
if os.path.isfile(processed_local_path):
return
failed_path = f"failed/{video_id}.json"
if os.path.isfile(failed_path):
return
for path in ["data", "failed"]:
local_path = f"{path}/{video_id}.json"
if os.path.isfile(local_path):
logger.info("File already exists", extra={"local_path": local_path})
return

# Retrieve or generate transcriptions
failed_path = f"failed/{video_id}.json"
logger.info("Obtaining transcriptions", extra={"video_id": video_id})
try:
transcription_with_timestamps = YouTubeTranscriptApi.get_transcript(
video_id, languages=["es"]
)
except _errors.TranscriptsDisabled:
print(f"Transcripts are disabled for video {video_id}")
logger.warning("Transcripts are disabled", extra={"video_id": video_id})
with open(failed_path, "w") as _file:
json.dump(video_metadata, _file, indent=4)
return
# See https://github.com/jdepoix/youtube-transcript-api/issues/320
except xml.etree.ElementTree.ParseError:
logger.warning("Retrieving transcript failed", extra={"video_id": video_id})
return
# Language for some videos is not Spanish - ES
# Example: https://www.youtube.com/watch?v=k_rBgKb1y8U
except _errors.NoTranscriptFound:
print(f"No transcript available for video {video_id}")
logger.warning("No transcript available", extra={"video_id": video_id})
with open(failed_path, "w") as _file:
json.dump(video_metadata, _file, indent=4)
return
Expand All @@ -81,7 +92,9 @@ def _process_video(video_metadata):
if not video_metadata.get("videoInfo"):
published_time_text = video_metadata["publishedTimeText"]["simpleText"]
video_length = video_metadata["lengthText"]["accessibility"]["accessibilityData"]["label"]
video_length_seconds = TimeLength(video_length).total_seconds
video_length_seconds = TimeLength(video_length)
assert video_length_seconds.result.success
video_length_seconds = video_length_seconds.result.seconds
video_length_seconds = int(video_length_seconds)
else:
published_time_text = video_metadata["videoInfo"]["runs"][-1]["text"]
Expand All @@ -99,13 +112,17 @@ def _process_video(video_metadata):
"playlist_id": video_metadata["playlist_id"],
"playlist_title": video_metadata["playlist_title"],
"published_time_text": published_time_text,
"retrieved_time": str(datetime.datetime.utcnow()),
"retrieved_time": str(datetime.datetime.now(datetime.timezone.utc)),
}
with open(processed_local_path, "w") as _file:
with open(local_path, "w") as _file:
json.dump(video, _file, indent=4)

return video


if __name__ == "__main__":
handler()
if os.getenv("AMLO"):
channel_id = AMLO_CHANNEL_ID
else:
channel_id = SHEINBAUM_CHANNEL_ID
handler(channel_id)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
aws-lambda-powertools
ffmpeg-python
openai-whisper
pandas
Expand Down
18 changes: 10 additions & 8 deletions transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import os

import whisper
from aws_lambda_powertools import Logger
from pytubefix import YouTube, exceptions
from timelength import TimeLength

logger = Logger()
videos = [f for f in os.listdir("failed")]

all_data = []
Expand All @@ -18,7 +20,7 @@
if os.path.isfile(f"manual_transcriptions/{video}"):
continue

print(f"Downloading video {video_id}")
logger.info("Downloading video", extra={"video_id": video_id})
video_url = f"https://www.youtube.com/watch?v={video_id}"
try:
audio_file = (
Expand All @@ -29,21 +31,21 @@
)
# KeyError: 'content-length'
except KeyError:
print(f"Failed obtaining audio for {video_id} (KeyError)")
logger.error("Failed obtaining audio (KeyError)", extra={"video_id": video_id})
continue
# kZB-Up9HnT4 is age restricted, and can't be accessed without logging in.
except exceptions.AgeRestrictedError:
print(f"Failed obtaining audio for {video_id} (AgeRestrictedError)")
logger.error("Failed obtaining audio (AgeRestrictedError)", extra={"video_id": video_id})
continue
# jys_9oreLA0 is a private video
except exceptions.VideoPrivate:
print(f"Failed obtaining audio for {video_id} (VideoPrivate)")
logger.error("Failed obtaining audio(VideoPrivate)", extra={"video_id": video_id})
continue
# EMb7n2q5qSc is streaming live and cannot be loaded
except exceptions.LiveStreamError:
print(f"Failed obtaining audio for {video_id} (LiveStreamError)")
logger.error("Failed obtaining audio (LiveStreamError)", extra={"video_id": video_id})
continue
print(f"Transcribing video {video_id}")
logger.info("Transcribing video", extra={"video_id": video_id})
whisper_model = whisper.load_model("medium")
# TODO: Try tweaking the patience and beam_size, e.g. patience=2, beam_size=5
transcription = whisper_model.transcribe(audio_file, language="es")
Expand Down Expand Up @@ -79,7 +81,7 @@
if video_metadata.get("lengthSeconds"):
video_length_seconds = int(video_metadata["lengthSeconds"])
else:
print(f"Length not found for video {video_id}")
logger.error("Length not found", extra={"video_id": video_id})
video_length_seconds = None

video = {
Expand All @@ -102,4 +104,4 @@
json.dump(video, _file, indent=4)

os.remove(_video_metadata_file)
print(f"Wrote {processed_local_path}")
logger.info("Wrote file", extra={"processed_local_path": processed_local_path})

0 comments on commit 92d3445

Please sign in to comment.