From 326187d8a8388e47e881a0877ab6eef2e66386d5 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 14 Oct 2024 09:52:15 +0000 Subject: [PATCH] Filter-out non-public videos and properly cleanup unsuccessful videos --- CHANGELOG.md | 4 ++++ scraper/src/youtube2zim/scraper.py | 9 +++++---- scraper/src/youtube2zim/youtube.py | 7 ++++++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ed95834..fd4ff0b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Raise exception if there are no videos in the playlists (#347) +### Fixed + +- Filter-out non-public videos and properly cleanup unsuccessful videos (#362) + ## [3.2.0] - 2024-10-11 ### Deprecated diff --git a/scraper/src/youtube2zim/scraper.py b/scraper/src/youtube2zim/scraper.py index 0c76e4f9..288de24b 100644 --- a/scraper/src/youtube2zim/scraper.py +++ b/scraper/src/youtube2zim/scraper.py @@ -82,6 +82,7 @@ get_videos_json, save_channel_branding, skip_deleted_videos, + skip_non_public_videos, skip_outofrange_videos, ) @@ -611,6 +612,7 @@ def extract_videos_list(self): ) filter_videos = filter(skip_outofrange, videos_json) filter_videos = filter(skip_deleted_videos, filter_videos) + filter_videos = filter(skip_non_public_videos, filter_videos) all_videos.update( {v["contentDetails"]["videoId"]: v for v in filter_videos} ) @@ -1038,10 +1040,9 @@ def update_metadata(self): def make_json_files(self, actual_videos_ids): """Generate JSON files to be consumed by the frontend""" - def remove_unused_videos(videos): - video_ids = [video["contentDetails"]["videoId"] for video in videos] + def remove_unused_videos(): for path in self.videos_dir.iterdir(): - if path.is_dir() and path.name not in video_ids: + if path.is_dir() and path.name not in actual_videos_ids: logger.debug(f"Removing unused video {path.name}") shutil.rmtree(path, ignore_errors=True) @@ -1282,7 +1283,7 @@ def get_playlist_slug(playlist) -> str: ) # clean videos left out in videos directory - remove_unused_videos(videos) + remove_unused_videos() def add_file_to_zim( self, diff --git a/scraper/src/youtube2zim/youtube.py b/scraper/src/youtube2zim/youtube.py index 54a0f6ed..8eb17b09 100644 --- a/scraper/src/youtube2zim/youtube.py +++ b/scraper/src/youtube2zim/youtube.py @@ -190,7 +190,7 @@ def get_videos_json(playlist_id): PLAYLIST_ITEMS_API, params={ "playlistId": playlist_id, - "part": "snippet,contentDetails", + "part": "snippet,contentDetails,status", "key": YOUTUBE.api_key, "maxResults": RESULTS_PER_PAGE, "pageToken": page_token, @@ -309,6 +309,11 @@ def skip_deleted_videos(item): ) +def skip_non_public_videos(item): + """filter func to filter-out non-public videos""" + return item["status"]["privacyStatus"] == "public" + + def skip_outofrange_videos(date_range, item): """filter func to filter-out videos that are not within specified date range""" return dt_parser.parse(item["snippet"]["publishedAt"]).date() in date_range