Skip to content

Commit

Permalink
Merge pull request #378 from openzim/fix/differentiate-shorts-lives-n…
Browse files Browse the repository at this point in the history
…ormal-videos

Shorts, lives and long videos in the UI + fix shorts display
  • Loading branch information
benoit74 authored Nov 26, 2024
2 parents 234e6fa + 246f9e1 commit 7754caf
Show file tree
Hide file tree
Showing 21 changed files with 381 additions and 105 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed
### Changed

- Differentiate user uploaded shorts, lives & long videos (#367)

### Fixed

- Corrected the short video resolution in the UI (#366)
- Check for empty playlists after filtering, and after downloading videos (#375)

## [3.2.1] - 2024-11-01
Expand Down
8 changes: 8 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,11 @@ yarn test:e2e
```

On Linux, you might need to install additional dependencies, see [Linux Prerequisites](https://docs.cypress.io/guides/getting-started/installing-cypress#Linux-Prerequisites) in the Cypress documentation.

## Running integration tests checking ZIM content

We have a bunch of integration tests checking ZIM content. Once you have the test ZIM from the openZIM channel (see instructions above for Vue.JS ZIM UI), you can run the tests locally as well:

```
ZIM_FILE_PATH="output/openZIM_testing.zim" pytest scraper/tests-integration/integration.py
```
8 changes: 3 additions & 5 deletions scraper/src/youtube2zim/playlists/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ def run(self):
(
playlists,
main_channel_id,
uploads_playlist_id,
user_long_uploads_playlist_id,
user_short_uploads_playlist_id,
user_lives_playlist_id,
is_playlist,
) = extract_playlists_details_from(self.youtube_id)

Expand All @@ -106,10 +108,6 @@ def run(self):
shutil.rmtree(self.build_dir, ignore_errors=True)

for playlist in playlists:
if playlist.playlist_id == uploads_playlist_id:
logger.info(f"Skipping playlist {playlist.playlist_id} (uploads one)")
continue

logger.info(f"Executing youtube2zim for playlist {playlist.playlist_id}")
success, process = self.run_playlist_zim(playlist)
if success:
Expand Down
5 changes: 4 additions & 1 deletion scraper/src/youtube2zim/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,10 @@ class Channel(CamelModel):
profile_path: str | None = None
banner_path: str | None = None
joined_date: str
main_playlist: str | None = None
first_playlist: str | None = None
user_long_uploads_playlist: str | None = None
user_short_uploads_playlist: str | None = None
user_lives_playlist: str | None = None
playlist_count: int


Expand Down
66 changes: 24 additions & 42 deletions scraper/src/youtube2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,9 @@ def __init__(

# process-related
self.playlists = []
self.uploads_playlist_id = None
self.user_long_uploads_playlist_id = None
self.user_short_uploads_playlist_id = None
self.user_lives_playlist_id = None
self.videos_ids = []
self.video_ids_count = 0
self.videos_processed = 0
Expand Down Expand Up @@ -229,30 +231,6 @@ def banner_path(self):
def is_single_channel(self):
return len({pl.creator_id for pl in self.playlists}) == 1

@property
def sorted_playlists(self):
    """Return the playlists sorted by title, with the Uploads playlist first.

    When there is at most one playlist, ``self.playlists`` is returned as-is
    (same object, no sorting needed). Otherwise a new list is returned,
    ordered by ``title``; if ``self.uploads_playlist_id`` is set and matches
    a playlist, that playlist is moved to the front and the relative order
    of the others is kept.
    """
    if len(self.playlists) <= 1:
        return self.playlists

    by_title = sorted(self.playlists, key=lambda x: x.title)
    if not self.uploads_playlist_id:
        return by_title

    # Look the Uploads playlist up explicitly instead of indexing an
    # eventually-empty list and catching a broad Exception: a missing match
    # is an expected situation, while any other error should not be hidden.
    uploads_positions = [
        position
        for position, playlist in enumerate(by_title)
        if playlist.playlist_id == self.uploads_playlist_id
    ]
    if not uploads_positions:
        return by_title

    # keep the historical choice of the last match should the ID be duplicated
    uploads_at = uploads_positions[-1]
    return (
        [by_title[uploads_at]]
        + by_title[:uploads_at]
        + by_title[uploads_at + 1 :]
    )

def run(self):
"""execute the scraper step by step"""

Expand Down Expand Up @@ -552,7 +530,9 @@ def extract_playlists(self):
(
self.playlists,
self.main_channel_id,
self.uploads_playlist_id,
self.user_long_uploads_playlist_id,
self.user_short_uploads_playlist_id,
self.user_lives_playlist_id,
self.is_playlist,
) = extract_playlists_details_from(self.youtube_id)

Expand Down Expand Up @@ -1045,6 +1025,7 @@ def generate_video_object(video) -> Video:
author = videos_channels[video_id]
subtitles_list = get_subtitles(video_id)
channel_data = get_channel_json(author["channelId"])

return Video(
id=video_id,
title=video["snippet"]["title"],
Expand Down Expand Up @@ -1151,10 +1132,13 @@ def get_playlist_slug(playlist) -> str:
)

# write playlists JSON files
playlist_list = []
home_playlist_list = []
playlist_list: list[PlaylistPreview] = []
home_playlist_list: list[Playlist] = []

user_long_uploads_playlist_slug = None
user_short_uploads_playlist_slug = None
user_lives_playlist_slug = None

main_playlist_slug = None
empty_playlists = list(
filter(lambda playlist: len(get_videos_list(playlist)) == 0, self.playlists)
)
Expand All @@ -1167,10 +1151,6 @@ def get_playlist_slug(playlist) -> str:
if len(self.playlists) == 0:
raise Exception("No playlist succeeded to download")

main_playlist_slug = get_playlist_slug(
self.playlists[0]
) # set first playlist as main playlist

for playlist in self.playlists:
playlist_slug = get_playlist_slug(playlist)
playlist_path = f"playlists/{playlist_slug}.json"
Expand All @@ -1195,16 +1175,15 @@ def get_playlist_slug(playlist) -> str:
# modify playlist object for preview on homepage
playlist_obj.videos = playlist_obj.videos[:12]

if playlist.playlist_id == self.uploads_playlist_id:
main_playlist_slug = (
playlist_slug # set uploads playlist as main playlist
)
# insert uploads playlist at the beginning of the list
playlist_list.insert(0, generate_playlist_preview_object(playlist))
home_playlist_list.insert(0, playlist_obj)
home_playlist_list.append(playlist_obj)
if playlist.playlist_id == self.user_long_uploads_playlist_id:
user_long_uploads_playlist_slug = playlist_slug
elif playlist.playlist_id == self.user_short_uploads_playlist_id:
user_short_uploads_playlist_slug = playlist_slug
elif playlist.playlist_id == self.user_lives_playlist_id:
user_lives_playlist_slug = playlist_slug
else:
playlist_list.append(generate_playlist_preview_object(playlist))
home_playlist_list.append(playlist_obj)

# write playlists.json file
self.zim_file.add_item_for(
Expand Down Expand Up @@ -1241,7 +1220,10 @@ def get_playlist_slug(playlist) -> str:
channel_description=channel_data["snippet"]["description"],
profile_path="profile.jpg",
banner_path="banner.jpg",
main_playlist=main_playlist_slug,
first_playlist=home_playlist_list[0].slug,
user_long_uploads_playlist=user_long_uploads_playlist_slug,
user_short_uploads_playlist=user_short_uploads_playlist_slug,
user_lives_playlist=user_lives_playlist_slug,
playlist_count=len(self.playlists),
joined_date=channel_data["snippet"]["publishedAt"],
).model_dump_json(by_alias=True, indent=2),
Expand Down
57 changes: 46 additions & 11 deletions scraper/src/youtube2zim/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ def __init__(
@classmethod
def from_id(cls, playlist_id):
playlist_json = get_playlist_json(playlist_id)
if playlist_json is None:
raise PlaylistNotFoundError(
f"Invalid playlistId `{playlist_id}`: Not Found"
)
return Playlist(
playlist_id=playlist_id,
title=playlist_json["snippet"]["title"],
Expand Down Expand Up @@ -176,10 +180,13 @@ def get_playlist_json(playlist_id):
req.raise_for_status()
try:
playlist_json = req.json()["items"][0]
total_results = req.json().get("pageInfo", {}).get("totalResults", 0)
if total_results == 0:
logger.error(f"Playlist `{playlist_id}`: No Item Available")
return None
except IndexError:
raise PlaylistNotFoundError(
f"Invalid playlistId `{playlist_id}`: Not Found"
) from None
logger.error(f"Invalid playlistId `{playlist_id}`: Not Found")
return None
save_json(YOUTUBE.cache_dir, fname, playlist_json)
return playlist_json

Expand Down Expand Up @@ -336,8 +343,9 @@ def skip_outofrange_videos(date_range, item):
def extract_playlists_details_from(youtube_id: str):
"""prepare a list of Playlist from user request"""

uploads_playlist_id = None
main_channel_id = None
main_channel_id = user_long_uploads_playlist_id = user_short_uploads_playlist_id = (
user_lives_playlist_id
) = None
if "," not in youtube_id:
try:
# first try to consider passed ID is a channel ID (or username or handle)
Expand All @@ -347,11 +355,36 @@ def extract_playlists_details_from(youtube_id: str):
playlist_ids = [
p["id"] for p in get_channel_playlists_json(main_channel_id)
]
# we always include uploads playlist (contains everything)
playlist_ids += [
channel_json["contentDetails"]["relatedPlaylists"]["uploads"]
]
uploads_playlist_id = playlist_ids[-1]

# Get special playlists JSON objects
user_long_uploads_json = get_playlist_json("UULF" + main_channel_id[2:])
user_short_uploads_json = get_playlist_json("UUSH" + main_channel_id[2:])
user_lives_json = get_playlist_json("UULV" + main_channel_id[2:])

# Extract special playlists IDs if the JSON objects are not None
user_long_uploads_playlist_id = (
user_long_uploads_json["id"] if user_long_uploads_json else None
)
user_short_uploads_playlist_id = (
user_short_uploads_json["id"] if user_short_uploads_json else None
)
user_lives_playlist_id = user_lives_json["id"] if user_lives_json else None

# Add special playlists if they exists, in proper order
playlist_ids = (
list(
filter(
None,
[
user_long_uploads_playlist_id,
user_short_uploads_playlist_id,
user_lives_playlist_id,
],
)
)
+ playlist_ids
)

is_playlist = False
except ChannelNotFoundError:
# channel not found, then ID should be a playlist
Expand All @@ -370,6 +403,8 @@ def extract_playlists_details_from(youtube_id: str):
# dict.fromkeys maintains the order of playlist_ids while removing duplicates
[Playlist.from_id(playlist_id) for playlist_id in dict.fromkeys(playlist_ids)],
main_channel_id,
uploads_playlist_id,
user_long_uploads_playlist_id,
user_short_uploads_playlist_id,
user_lives_playlist_id,
is_playlist,
)
Loading

0 comments on commit 7754caf

Please sign in to comment.