Clean up parse_item.py

justin025 · Dec 12, 2024 · commit b2e4f27
1 parent 054c724

Showing 7 changed files with 284 additions and 427 deletions.
diff --git a/src/onthespot/api/bandcamp.py b/src/onthespot/api/bandcamp.py
@@ -78,23 +78,25 @@ def bandcamp_get_search_results(token, search_term, content_types):
     return search_results
 
 
-def bandcamp_get_album_data(url):
+def bandcamp_get_album_track_ids(token, url):
+    logger.info(f"Getting tracks from album: {url}")
     album_webpage = make_call(url, text=True, use_ssl=True)
 
     matches = re.findall(r'<script type="application/ld\+json">\s*(\{.*?\})\s*</script>', album_webpage, re.DOTALL)
-    if matches:
-        for match in matches:
-            json_data_str = match
-            json_data_str = re.sub(r',\s*}', '}', json_data_str)  # Remove trailing commas
-            album_data = json.loads(json_data_str)
+    for match in matches:
+        json_data_str = match
+        json_data_str = re.sub(r',\s*}', '}', json_data_str)  # Remove trailing commas
+        album_data = json.loads(json_data_str)
 
-            return album_data
+        item_ids = []
+        for track in album_data.get('track', {}).get('itemListElement', []):
+            item_ids.append(track['item'].get('@id'))
+        return item_ids
 
 
 def bandcamp_get_track_metadata(token, url):
     track_webpage = make_call(url, text=True, use_ssl=True)
     track_data = {}
-
     matches = re.findall(r'data-(\w+)="(.*?)"', track_webpage)
     for match in matches:
         attribute_name, attribute_value = match
@@ -107,7 +109,12 @@ def bandcamp_get_track_metadata(token, url):
         except json.JSONDecodeError:
             track_data[attribute_name] = decoded_value
 
-    album_data = bandcamp_get_album_data(track_data['embed']['album_embed_data']['linkback'])
+    album_webpage = make_call(url, text=True, use_ssl=True)
+    matches = re.findall(r'<script type="application/ld\+json">\s*(\{.*?\})\s*</script>', album_webpage, re.DOTALL)
+    for match in matches:
+        json_data_str = match
+        json_data_str = re.sub(r',\s*}', '}', json_data_str)  # Remove trailing commas
+        album_data = json.loads(json_data_str)
 
     # Year
     year = ''
@@ -145,14 +152,15 @@ def bandcamp_get_track_metadata(token, url):
     info['is_playable'] = True
     return info
 
-def bandcamp_get_artist_albums(url):
+def bandcamp_get_artist_album_ids(token, url):
+    logger.info(f"Getting album ids for artist: '{url}'")
     root_url = re.match(r'^(https?://[^/]+)', url).group(1)
     artist_webpage = make_call(url, text=True, use_ssl=True)
 
-    album_list = []
+    album_urls = []
     matches = re.findall(r'<a\s+href=["\'](\/album[^"\']*)["\']', artist_webpage)
     for href in matches:
         full_url = f"{root_url}{href}"
-        album_list.append(full_url)
+        album_urls.append(full_url)
 
-    return album_list
+    return album_urls
diff --git a/src/onthespot/api/deezer.py b/src/onthespot/api/deezer.py
@@ -50,30 +50,39 @@ def deezer_add_account(arl):
     config.update()
 
 
-def deezer_get_album_items(album_id):
+def deezer_get_album_track_ids(token, album_id):
+    logger.info(f"Getting tracks from album: {album_id}")
     album_data = make_call(f"{DEEZER_BASE}/album/{album_id}")
-    return album_data.get("tracks", {}).get("data", '')
+    item_ids = []
+    for track in album_data.get("tracks", {}).get("data", ''):
+        item_ids.append(track['id'])
+    return item_ids
 
 
-def deezer_get_playlist_items(playlist_id):
+def deezer_get_artist_album_ids(token, artist_id):
+    logger.info(f"Getting album ids for artist: '{artist_id}'")
+    album_data = make_call(f"{DEEZER_BASE}/artist/{artist_id}/albums")
+    item_ids = []
+    for album in album_data.get("data", ''):
+        item_ids.append(album.get("id", ''))
+    return item_ids
+
+
+def deezer_get_playlist_track_ids(token, playlist_id):
+    logger.info(f"Getting items in playlist: '{playlist_id}'")
     album_data = make_call(f"{DEEZER_BASE}/playlist/{playlist_id}")
-    return album_data.get("tracks", {}).get("data", '')
+    item_ids = []
+    for track in album_data.get("tracks", {}).get("data", ''):
+        item_ids.append(track['id'])
+    return item_ids
 
 
-def deezer_get_playlist_data(playlist_id):
+def deezer_get_playlist_data(token, playlist_id):
+    logger.info(f"Get playlist data for playlist: '{playlist_id}'")
     playlist_data = make_call(f"{DEEZER_BASE}/playlist/{playlist_id}")
     return playlist_data.get("title", ''), playlist_data.get("creator", {}).get("name", '')
 
 
-def deezer_get_artist_albums(artist_id):
-    album_data = make_call(f"{DEEZER_BASE}/artist/{artist_id}/albums")
-
-    url_list = []
-    for album in album_data.get("data", ''):
-        url_list.append(album.get("link", ''))
-    return url_list
-
-
 def deezer_get_track_metadata(token, item_id):
     logger.info(f"Get track info for: '{item_id}'")
 
@@ -279,6 +288,7 @@ def deezer_login_user(account):
         })
         return False
 
+
 def deezer_get_token(parsing_index):
     return account_pool[config.get('parsing_acc_sn')]['login']['session']
 

diff --git a/src/onthespot/api/soundcloud.py b/src/onthespot/api/soundcloud.py
@@ -176,6 +176,7 @@ def soundcloud_get_search_results(token, search_term, content_types):
 
 
 def soundcloud_get_set_items(token, url):
+    logger.info(f"Getting set items for {url}")
     headers = {}
     headers["user-agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
 

diff --git a/src/onthespot/api/spotify.py b/src/onthespot/api/spotify.py
@@ -243,15 +243,19 @@ def spotify_get_token(parsing_index):
     return token
 
 
-def spotify_get_artist_albums(token, artist_id):
-    logger.info(f"Get albums for artist by id '{artist_id}'")
+def spotify_get_artist_album_ids(token, artist_id):
+    logger.info(f"Getting album ids for artist: '{artist_id}'")
     headers = {"Authorization": f"Bearer {token}"}
-    resp = make_call(f'https://api.spotify.com/v1/artists/{artist_id}/albums?include_groups=album%2Csingle&limit=50', headers=headers) #%2Cappears_on%2Ccompilation
-    return [resp['items'][i]['external_urls']['spotify'] for i in range(len(resp['items']))]
+    artist_data = make_call(f'https://api.spotify.com/v1/artists/{artist_id}/albums?include_groups=album%2Csingle&limit=50', headers=headers) #%2Cappears_on%2Ccompilation
+
+    item_ids = []
+    for album in artist_data['items']:
+        item_ids.append(album['id'])
+    return item_ids
 
 
 def spotify_get_playlist_data(token, playlist_id):
-    logger.info(f"Get playlist dump for '{playlist_id}'")
+    logger.info(f"Get playlist data for playlist: {playlist_id}")
     headers = {"Authorization": f"Bearer {token}"}
     resp = make_call(f'https://api.spotify.com/v1/playlists/{playlist_id}', headers=headers, skip_cache=True)
     return resp['name'], resp['owner']['display_name']
@@ -432,9 +436,9 @@ def get_album_name(token, album_id):
             resp['total_tracks']
 
 
-def spotify_get_album_tracks(token, album_id):
-    logger.info(f"Get tracks from album by id '{album_id}'")
-    songs = []
+def spotify_get_album_track_ids(token, album_id):
+    logger.info(f"Getting tracks from album: {album_id}")
+    tracks = []
     offset = 0
     limit = 50
 
@@ -444,11 +448,15 @@ def spotify_get_album_tracks(token, album_id):
         resp = make_call(url, headers=headers)
 
         offset += limit
-        songs.extend(resp['items'])
+        tracks.extend(resp['items'])
 
         if resp['total'] <= offset:
             break
-    return songs
+
+    item_ids = []
+    for track in tracks:
+        item_ids.append(track['id'])
+    return item_ids
 
 
 def spotify_get_search_results(token, search_term, content_types):
@@ -603,14 +611,14 @@ def spotify_get_episode_metadata(token, episode_id):
     logger.info(f"Get episode info for episode by id '{episode_id}'")
     headers = {"Authorization": f"Bearer {token}"}
     episode_data = make_call(f"https://api.spotify.com/v1/episodes/{episode_id}", headers=headers)
-    show_data = spotify_get_show_episodes(token, episode_data.get('show', {}).get('id', ''))
+    show_episode_ids = spotify_get_show_episode_ids(token, episode_data.get('show', {}).get('id', ''))
     # I believe audiobook ids start with a 7 but to verify you can use https://api.spotify.com/v1/audiobooks/{id}
     # the endpoint could possibly be used to mark audiobooks in genre but it doesn't really provide any additional
     # metadata compared to show_data beyond abridged and unabridged.
 
     track_number = ''
-    for index, episode in enumerate(show_data):
-        if episode['id'] == episode_id:
+    for index, episode in enumerate(show_episode_ids):
+        if episode == episode_id:
             track_number = index + 1
             break
 
@@ -627,7 +635,7 @@ def spotify_get_episode_metadata(token, episode_id):
     info['track_number'] = track_number
     # Not accurate
     #info['total_tracks'] = episode_data.get('show', {}).get('total_episodes', 0)
-    info['total_tracks'] = len([episode for episode in show_data if episode])
+    info['total_tracks'] = len(show_episode_ids)
     info['artists'] = conv_list_format([episode_data.get('show', {}).get('publisher', '')])
     info['album_artists'] = conv_list_format([episode_data.get('show', {}).get('publisher', '')])
     info['language'] = conv_list_format(episode_data.get('languages', []))
@@ -643,8 +651,8 @@ def spotify_get_episode_metadata(token, episode_id):
     return info
 
 
-def spotify_get_show_episodes(token, show_id):
-    logger.info(f"Get episodes for show by id '{show_id}'")
+def spotify_get_show_episode_ids(token, show_id):
+    logger.info(f"Getting show episodes: {show_id}'")
     episodes = []
     offset = 0
     limit = 50
@@ -659,4 +667,8 @@ def spotify_get_show_episodes(token, show_id):
 
         if resp['total'] <= offset:
             break
-    return episodes
+
+    item_ids = []
+    for episode in episodes:
+        item_ids.append(episode['id'])
+    return item_ids