Merge pull request #378 from openzim/fix/differentiate-shorts-lives-n…

…ormal-videos Shorts, lives and long videos in the UI + fix shorts display
openzim · Nov 26, 2024 · 7754caf · 7754caf
2 parents 234e6fa + 246f9e1
commit 7754caf
Show file tree

Hide file tree

Showing 21 changed files with 381 additions and 105 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,8 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-### Fixed
+### Changed
+
+- Differentiate user uploaded shorts, lives & long videos (#367)
+
+### Fixed
 
+- Corrected the short video resolution in the UI (#366)
 - Check for empty playlists after filtering, and after downloading videos (#375)
 
 ## [3.2.1] - 2024-11-01

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -101,3 +101,11 @@ yarn test:e2e
 ```
 
 On Linux, you might need to install additional dependencies, see [Linux Prerequisites](https://docs.cypress.io/guides/getting-started/installing-cypress#Linux-Prerequisites) in the Cypress documentation.
+
+## Running integration tests checking ZIM content
+
+We have a set of integration tests checking ZIM content. Once you have the test ZIM from the openZIM channel (see instructions above for Vue.JS ZIM UI), you can run the tests locally as well:
+
+```
+ZIM_FILE_PATH="output/openZIM_testing.zim" pytest scraper/tests-integration/integration.py
+```
diff --git a/scraper/src/youtube2zim/playlists/scraper.py b/scraper/src/youtube2zim/playlists/scraper.py
@@ -91,7 +91,9 @@ def run(self):
         (
             playlists,
             main_channel_id,
-            uploads_playlist_id,
+            user_long_uploads_playlist_id,
+            user_short_uploads_playlist_id,
+            user_lives_playlist_id,
             is_playlist,
         ) = extract_playlists_details_from(self.youtube_id)
 
@@ -106,10 +108,6 @@ def run(self):
         shutil.rmtree(self.build_dir, ignore_errors=True)
 
         for playlist in playlists:
-            if playlist.playlist_id == uploads_playlist_id:
-                logger.info(f"Skipping playlist {playlist.playlist_id} (uploads one)")
-                continue
-
             logger.info(f"Executing youtube2zim for playlist {playlist.playlist_id}")
             success, process = self.run_playlist_zim(playlist)
             if success:

diff --git a/scraper/src/youtube2zim/schemas.py b/scraper/src/youtube2zim/schemas.py
@@ -105,7 +105,10 @@ class Channel(CamelModel):
     profile_path: str | None = None
     banner_path: str | None = None
     joined_date: str
-    main_playlist: str | None = None
+    first_playlist: str | None = None
+    user_long_uploads_playlist: str | None = None
+    user_short_uploads_playlist: str | None = None
+    user_lives_playlist: str | None = None
     playlist_count: int
 
 

diff --git a/scraper/src/youtube2zim/scraper.py b/scraper/src/youtube2zim/scraper.py
@@ -170,7 +170,9 @@ def __init__(
 
         # process-related
         self.playlists = []
-        self.uploads_playlist_id = None
+        self.user_long_uploads_playlist_id = None
+        self.user_short_uploads_playlist_id = None
+        self.user_lives_playlist_id = None
         self.videos_ids = []
         self.video_ids_count = 0
         self.videos_processed = 0
@@ -229,30 +231,6 @@ def banner_path(self):
     def is_single_channel(self):
         return len({pl.creator_id for pl in self.playlists}) == 1
 
-    @property
-    def sorted_playlists(self):
-        """sorted list of playlists (by title) but with Uploads one at first if any"""
-        if len(self.playlists) <= 1:
-            return self.playlists
-
-        sorted_playlists = sorted(self.playlists, key=lambda x: x.title)
-        index = 0
-        # make sure our Uploads, special playlist is first
-        if self.uploads_playlist_id:
-            try:
-                index = [
-                    index
-                    for index, p in enumerate(sorted_playlists)
-                    if p.playlist_id == self.uploads_playlist_id
-                ][-1]
-            except Exception:
-                index = 0
-        return (
-            [sorted_playlists[index]]
-            + sorted_playlists[0:index]
-            + sorted_playlists[index + 1 :]
-        )
-
     def run(self):
         """execute the scraper step by step"""
 
@@ -552,7 +530,9 @@ def extract_playlists(self):
         (
             self.playlists,
             self.main_channel_id,
-            self.uploads_playlist_id,
+            self.user_long_uploads_playlist_id,
+            self.user_short_uploads_playlist_id,
+            self.user_lives_playlist_id,
             self.is_playlist,
         ) = extract_playlists_details_from(self.youtube_id)
 
@@ -1045,6 +1025,7 @@ def generate_video_object(video) -> Video:
             author = videos_channels[video_id]
             subtitles_list = get_subtitles(video_id)
             channel_data = get_channel_json(author["channelId"])
+
             return Video(
                 id=video_id,
                 title=video["snippet"]["title"],
@@ -1151,10 +1132,13 @@ def get_playlist_slug(playlist) -> str:
             )
 
         # write playlists JSON files
-        playlist_list = []
-        home_playlist_list = []
+        playlist_list: list[PlaylistPreview] = []
+        home_playlist_list: list[Playlist] = []
+
+        user_long_uploads_playlist_slug = None
+        user_short_uploads_playlist_slug = None
+        user_lives_playlist_slug = None
 
-        main_playlist_slug = None
         empty_playlists = list(
             filter(lambda playlist: len(get_videos_list(playlist)) == 0, self.playlists)
         )
@@ -1167,10 +1151,6 @@ def get_playlist_slug(playlist) -> str:
         if len(self.playlists) == 0:
             raise Exception("No playlist succeeded to download")
 
-        main_playlist_slug = get_playlist_slug(
-            self.playlists[0]
-        )  # set first playlist as main playlist
-
         for playlist in self.playlists:
             playlist_slug = get_playlist_slug(playlist)
             playlist_path = f"playlists/{playlist_slug}.json"
@@ -1195,16 +1175,15 @@ def get_playlist_slug(playlist) -> str:
             # modify playlist object for preview on homepage
             playlist_obj.videos = playlist_obj.videos[:12]
 
-            if playlist.playlist_id == self.uploads_playlist_id:
-                main_playlist_slug = (
-                    playlist_slug  # set uploads playlist as main playlist
-                )
-                # insert uploads playlist at the beginning of the list
-                playlist_list.insert(0, generate_playlist_preview_object(playlist))
-                home_playlist_list.insert(0, playlist_obj)
+            home_playlist_list.append(playlist_obj)
+            if playlist.playlist_id == self.user_long_uploads_playlist_id:
+                user_long_uploads_playlist_slug = playlist_slug
+            elif playlist.playlist_id == self.user_short_uploads_playlist_id:
+                user_short_uploads_playlist_slug = playlist_slug
+            elif playlist.playlist_id == self.user_lives_playlist_id:
+                user_lives_playlist_slug = playlist_slug
             else:
                 playlist_list.append(generate_playlist_preview_object(playlist))
-                home_playlist_list.append(playlist_obj)
 
         # write playlists.json file
         self.zim_file.add_item_for(
@@ -1241,7 +1220,10 @@ def get_playlist_slug(playlist) -> str:
                 channel_description=channel_data["snippet"]["description"],
                 profile_path="profile.jpg",
                 banner_path="banner.jpg",
-                main_playlist=main_playlist_slug,
+                first_playlist=home_playlist_list[0].slug,
+                user_long_uploads_playlist=user_long_uploads_playlist_slug,
+                user_short_uploads_playlist=user_short_uploads_playlist_slug,
+                user_lives_playlist=user_lives_playlist_slug,
                 playlist_count=len(self.playlists),
                 joined_date=channel_data["snippet"]["publishedAt"],
             ).model_dump_json(by_alias=True, indent=2),

diff --git a/scraper/src/youtube2zim/youtube.py b/scraper/src/youtube2zim/youtube.py
@@ -56,6 +56,10 @@ def __init__(
     @classmethod
     def from_id(cls, playlist_id):
         playlist_json = get_playlist_json(playlist_id)
+        if playlist_json is None:
+            raise PlaylistNotFoundError(
+                f"Invalid playlistId `{playlist_id}`: Not Found"
+            )
         return Playlist(
             playlist_id=playlist_id,
             title=playlist_json["snippet"]["title"],
@@ -176,10 +180,13 @@ def get_playlist_json(playlist_id):
         req.raise_for_status()
         try:
             playlist_json = req.json()["items"][0]
+            total_results = req.json().get("pageInfo", {}).get("totalResults", 0)
+            if total_results == 0:
+                logger.error(f"Playlist `{playlist_id}`: No Item Available")
+                return None
         except IndexError:
-            raise PlaylistNotFoundError(
-                f"Invalid playlistId `{playlist_id}`: Not Found"
-            ) from None
+            logger.error(f"Invalid playlistId `{playlist_id}`: Not Found")
+            return None
         save_json(YOUTUBE.cache_dir, fname, playlist_json)
     return playlist_json
 
@@ -336,8 +343,9 @@ def skip_outofrange_videos(date_range, item):
 def extract_playlists_details_from(youtube_id: str):
     """prepare a list of Playlist from user request"""
 
-    uploads_playlist_id = None
-    main_channel_id = None
+    main_channel_id = user_long_uploads_playlist_id = user_short_uploads_playlist_id = (
+        user_lives_playlist_id
+    ) = None
     if "," not in youtube_id:
         try:
             # first try to consider passed ID is a channel ID (or username or handle)
@@ -347,11 +355,36 @@ def extract_playlists_details_from(youtube_id: str):
             playlist_ids = [
                 p["id"] for p in get_channel_playlists_json(main_channel_id)
             ]
-            # we always include uploads playlist (contains everything)
-            playlist_ids += [
-                channel_json["contentDetails"]["relatedPlaylists"]["uploads"]
-            ]
-            uploads_playlist_id = playlist_ids[-1]
+
+            # Get special playlists JSON objects
+            user_long_uploads_json = get_playlist_json("UULF" + main_channel_id[2:])
+            user_short_uploads_json = get_playlist_json("UUSH" + main_channel_id[2:])
+            user_lives_json = get_playlist_json("UULV" + main_channel_id[2:])
+
+            # Extract special playlists IDs if the JSON objects are not None
+            user_long_uploads_playlist_id = (
+                user_long_uploads_json["id"] if user_long_uploads_json else None
+            )
+            user_short_uploads_playlist_id = (
+                user_short_uploads_json["id"] if user_short_uploads_json else None
+            )
+            user_lives_playlist_id = user_lives_json["id"] if user_lives_json else None
+
+            # Add special playlists if they exist, in proper order
+            playlist_ids = (
+                list(
+                    filter(
+                        None,
+                        [
+                            user_long_uploads_playlist_id,
+                            user_short_uploads_playlist_id,
+                            user_lives_playlist_id,
+                        ],
+                    )
+                )
+                + playlist_ids
+            )
+
             is_playlist = False
         except ChannelNotFoundError:
             # channel not found, then ID should be a playlist
@@ -370,6 +403,8 @@ def extract_playlists_details_from(youtube_id: str):
         # dict.fromkeys maintains the order of playlist_ids while removing duplicates
         [Playlist.from_id(playlist_id) for playlist_id in dict.fromkeys(playlist_ids)],
         main_channel_id,
-        uploads_playlist_id,
+        user_long_uploads_playlist_id,
+        user_short_uploads_playlist_id,
+        user_lives_playlist_id,
         is_playlist,
     )