Update channel id for Sheinbaum, use logging instead of print

ivansabik · Oct 18, 2024 · 92d3445
1 parent cf3f990
commit 92d3445
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 22 deletions.
diff --git a/process.py b/process.py
@@ -1,17 +1,23 @@
 import datetime
 import json
 import os
+import xml
 
 import requests
 import scrapetube
+from aws_lambda_powertools import Logger
 from timelength import TimeLength
 from youtube_transcript_api import YouTubeTranscriptApi, _errors
 
+AMLO_CHANNEL_ID = "UCxEgOKuI-n-WOJaNcisHvSg"
+SHEINBAUM_CHANNEL_ID = "UCvzHrtf9by1-UY67SfZse8w"
 
-def handler():
+logger = Logger()
+
+
+def handler(channel_id):
     # Get a list of all playlists in the channel
     api_key = os.environ["YOUTUBE_V3_API_KEY"]
-    channel_id = "UCxEgOKuI-n-WOJaNcisHvSg"
     playlists = requests.get(
         "https://www.googleapis.com/youtube/v3/playlists",
         params={
@@ -47,27 +53,32 @@ def _process_video(video_metadata):
     video_id = video_metadata["videoId"]
 
     # Check if video has already been processed
-    processed_local_path = f"data/{video_id}.json"
-    if os.path.isfile(processed_local_path):
-        return
-    failed_path = f"failed/{video_id}.json"
-    if os.path.isfile(failed_path):
-        return
+    for path in ["data", "failed"]:
+        local_path = f"{path}/{video_id}.json"
+        if os.path.isfile(local_path):
+            logger.info("File already exists", extra={"local_path": local_path})
+            return
 
     # Retrieve or generate transcriptions
+    failed_path = f"failed/{video_id}.json"
+    logger.info("Obtaining transcriptions", extra={"video_id": video_id})
     try:
         transcription_with_timestamps = YouTubeTranscriptApi.get_transcript(
             video_id, languages=["es"]
         )
     except _errors.TranscriptsDisabled:
-        print(f"Transcripts are disabled for video {video_id}")
+        logger.warning("Transcripts are disabled", extra={"video_id": video_id})
         with open(failed_path, "w") as _file:
             json.dump(video_metadata, _file, indent=4)
         return
+    # See https://github.com/jdepoix/youtube-transcript-api/issues/320
+    except xml.etree.ElementTree.ParseError:
+        logger.warning("Retrieving transcript failed", extra={"video_id": video_id})
+        return
     # Language for some videos is not Spanish - ES
     # Example: https://www.youtube.com/watch?v=k_rBgKb1y8U
     except _errors.NoTranscriptFound:
-        print(f"No transcript available for video {video_id}")
+        logger.warning("No transcript available", extra={"video_id": video_id})
         with open(failed_path, "w") as _file:
             json.dump(video_metadata, _file, indent=4)
         return
@@ -81,7 +92,9 @@ def _process_video(video_metadata):
     if not video_metadata.get("videoInfo"):
         published_time_text = video_metadata["publishedTimeText"]["simpleText"]
         video_length = video_metadata["lengthText"]["accessibility"]["accessibilityData"]["label"]
-        video_length_seconds = TimeLength(video_length).total_seconds
+        video_length_seconds = TimeLength(video_length)
+        assert video_length_seconds.result.success
+        video_length_seconds = video_length_seconds.result.seconds
         video_length_seconds = int(video_length_seconds)
     else:
         published_time_text = video_metadata["videoInfo"]["runs"][-1]["text"]
@@ -99,13 +112,17 @@ def _process_video(video_metadata):
         "playlist_id": video_metadata["playlist_id"],
         "playlist_title": video_metadata["playlist_title"],
         "published_time_text": published_time_text,
-        "retrieved_time": str(datetime.datetime.utcnow()),
+        "retrieved_time": str(datetime.datetime.now(datetime.timezone.utc)),
     }
-    with open(processed_local_path, "w") as _file:
+    with open(local_path, "w") as _file:
         json.dump(video, _file, indent=4)
 
     return video
 
 
 if __name__ == "__main__":
-    handler()
+    if os.getenv("AMLO"):
+        channel_id = AMLO_CHANNEL_ID
+    else:
+        channel_id = SHEINBAUM_CHANNEL_ID
+    handler(channel_id)
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
+aws-lambda-powertools
 ffmpeg-python
 openai-whisper
 pandas

diff --git a/transcribe.py b/transcribe.py
@@ -3,9 +3,11 @@
 import os
 
 import whisper
+from aws_lambda_powertools import Logger
 from pytubefix import YouTube, exceptions
 from timelength import TimeLength
 
+logger = Logger()
 videos = [f for f in os.listdir("failed")]
 
 all_data = []
@@ -18,7 +20,7 @@
     if os.path.isfile(f"manual_transcriptions/{video}"):
         continue
 
-    print(f"Downloading video {video_id}")
+    logger.info("Downloading video", extra={"video_id": video_id})
     video_url = f"https://www.youtube.com/watch?v={video_id}"
     try:
         audio_file = (
@@ -29,21 +31,21 @@
         )
     # KeyError: 'content-length'
     except KeyError:
-        print(f"Failed obtaining audio for {video_id} (KeyError)")
+        logger.error("Failed obtaining audio (KeyError)", extra={"video_id": video_id})
         continue
     # kZB-Up9HnT4 is age restricted, and can't be accessed without logging in.
     except exceptions.AgeRestrictedError:
-        print(f"Failed obtaining audio for {video_id} (AgeRestrictedError)")
+        logger.error("Failed obtaining audio (AgeRestrictedError)", extra={"video_id": video_id})
         continue
     # jys_9oreLA0 is a private video
     except exceptions.VideoPrivate:
-        print(f"Failed obtaining audio for {video_id} (VideoPrivate)")
+        logger.error("Failed obtaining audio(VideoPrivate)", extra={"video_id": video_id})
         continue
     # EMb7n2q5qSc is streaming live and cannot be loaded
     except exceptions.LiveStreamError:
-        print(f"Failed obtaining audio for {video_id} (LiveStreamError)")
+        logger.error("Failed obtaining audio (LiveStreamError)", extra={"video_id": video_id})
         continue
-    print(f"Transcribing video {video_id}")
+    logger.info("Transcribing video", extra={"video_id": video_id})
     whisper_model = whisper.load_model("medium")
     # TODO: Try tweaking the patience and beam_size, e.g. patience=2, beam_size=5
     transcription = whisper_model.transcribe(audio_file, language="es")
@@ -79,7 +81,7 @@
         if video_metadata.get("lengthSeconds"):
             video_length_seconds = int(video_metadata["lengthSeconds"])
         else:
-            print(f"Length not found for video {video_id}")
+            logger.error("Length not found", extra={"video_id": video_id})
             video_length_seconds = None
 
     video = {
@@ -102,4 +104,4 @@
         json.dump(video, _file, indent=4)
 
     os.remove(_video_metadata_file)
-    print(f"Wrote {processed_local_path}")
+    logger.info("Wrote file", extra={"processed_local_path": processed_local_path})