Skip to content

Commit

Permalink
Clean up parse_item.py
Browse files Browse the repository at this point in the history
  • Loading branch information
justin025 committed Dec 12, 2024
1 parent 054c724 commit b2e4f27
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 427 deletions.
34 changes: 21 additions & 13 deletions src/onthespot/api/bandcamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,23 +78,25 @@ def bandcamp_get_search_results(token, search_term, content_types):
return search_results


def bandcamp_get_album_data(url):
def bandcamp_get_album_track_ids(token, url):
logger.info(f"Getting tracks from album: {url}")
album_webpage = make_call(url, text=True, use_ssl=True)

matches = re.findall(r'<script type="application/ld\+json">\s*(\{.*?\})\s*</script>', album_webpage, re.DOTALL)
if matches:
for match in matches:
json_data_str = match
json_data_str = re.sub(r',\s*}', '}', json_data_str) # Remove trailing commas
album_data = json.loads(json_data_str)
for match in matches:
json_data_str = match
json_data_str = re.sub(r',\s*}', '}', json_data_str) # Remove trailing commas
album_data = json.loads(json_data_str)

return album_data
item_ids = []
for track in album_data.get('track', {}).get('itemListElement', []):
item_ids.append(track['item'].get('@id'))
return item_ids


def bandcamp_get_track_metadata(token, url):
track_webpage = make_call(url, text=True, use_ssl=True)
track_data = {}

matches = re.findall(r'data-(\w+)="(.*?)"', track_webpage)
for match in matches:
attribute_name, attribute_value = match
Expand All @@ -107,7 +109,12 @@ def bandcamp_get_track_metadata(token, url):
except json.JSONDecodeError:
track_data[attribute_name] = decoded_value

album_data = bandcamp_get_album_data(track_data['embed']['album_embed_data']['linkback'])
album_webpage = make_call(url, text=True, use_ssl=True)
matches = re.findall(r'<script type="application/ld\+json">\s*(\{.*?\})\s*</script>', album_webpage, re.DOTALL)
for match in matches:
json_data_str = match
json_data_str = re.sub(r',\s*}', '}', json_data_str) # Remove trailing commas
album_data = json.loads(json_data_str)

# Year
year = ''
Expand Down Expand Up @@ -145,14 +152,15 @@ def bandcamp_get_track_metadata(token, url):
info['is_playable'] = True
return info

def bandcamp_get_artist_albums(url):
def bandcamp_get_artist_album_ids(token, url):
logger.info(f"Getting album ids for artist: '{url}'")
root_url = re.match(r'^(https?://[^/]+)', url).group(1)
artist_webpage = make_call(url, text=True, use_ssl=True)

album_list = []
album_urls = []
matches = re.findall(r'<a\s+href=["\'](\/album[^"\']*)["\']', artist_webpage)
for href in matches:
full_url = f"{root_url}{href}"
album_list.append(full_url)
album_urls.append(full_url)

return album_list
return album_urls
38 changes: 24 additions & 14 deletions src/onthespot/api/deezer.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,30 +50,39 @@ def deezer_add_account(arl):
config.update()


def deezer_get_album_items(album_id):
def deezer_get_album_track_ids(token, album_id):
logger.info(f"Getting tracks from album: {album_id}")
album_data = make_call(f"{DEEZER_BASE}/album/{album_id}")
return album_data.get("tracks", {}).get("data", '')
item_ids = []
for track in album_data.get("tracks", {}).get("data", ''):
item_ids.append(track['id'])
return item_ids


def deezer_get_playlist_items(playlist_id):
def deezer_get_artist_album_ids(token, artist_id):
    """Return the album ids listed for a Deezer artist.

    Args:
        token: Accepted for signature parity with the other ``*_ids``
            helpers; this function does not read it.
        artist_id: Deezer artist id interpolated into the request URL.

    Returns:
        A list with one entry per element of the response's ``data``
        field. An element without an ``id`` key contributes ``''``; a
        response without ``data`` yields an empty list.
    """
    logger.info(f"Getting album ids for artist: '{artist_id}'")
    response = make_call(f"{DEEZER_BASE}/artist/{artist_id}/albums")
    # NOTE(review): only the payload returned by this single call is
    # inspected; whether the endpoint paginates is not handled here.
    return [entry.get("id", '') for entry in response.get("data", '')]


def deezer_get_playlist_track_ids(token, playlist_id):
logger.info(f"Getting items in playlist: '{playlist_id}'")
album_data = make_call(f"{DEEZER_BASE}/playlist/{playlist_id}")
return album_data.get("tracks", {}).get("data", '')
item_ids = []
for track in album_data.get("tracks", {}).get("data", ''):
item_ids.append(track['id'])
return item_ids


def deezer_get_playlist_data(playlist_id):
def deezer_get_playlist_data(token, playlist_id):
logger.info(f"Get playlist data for playlist: '{playlist_id}'")
playlist_data = make_call(f"{DEEZER_BASE}/playlist/{playlist_id}")
return playlist_data.get("title", ''), playlist_data.get("creator", {}).get("name", '')


def deezer_get_artist_albums(artist_id):
    """Return the album links listed for a Deezer artist.

    Args:
        artist_id: Deezer artist id interpolated into the request URL.

    Returns:
        A list with one entry per element of the response's ``data``
        field, taken from each element's ``link`` key (``''`` when the
        key is absent). A response without ``data`` yields an empty list.
    """
    response = make_call(f"{DEEZER_BASE}/artist/{artist_id}/albums")
    return [entry.get("link", '') for entry in response.get("data", '')]


def deezer_get_track_metadata(token, item_id):
logger.info(f"Get track info for: '{item_id}'")

Expand Down Expand Up @@ -279,6 +288,7 @@ def deezer_login_user(account):
})
return False


def deezer_get_token(parsing_index):
    """Return the login session of the configured parsing account.

    Args:
        parsing_index: Not read by this function; the account is chosen
            from the ``parsing_acc_sn`` config value instead.
            NOTE(review): confirm that ignoring this argument is intended.

    Returns:
        The ``['login']['session']`` entry of the selected ``account_pool``
        item.
    """
    account_slot = config.get('parsing_acc_sn')
    account = account_pool[account_slot]
    return account['login']['session']

Expand Down
1 change: 1 addition & 0 deletions src/onthespot/api/soundcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def soundcloud_get_search_results(token, search_term, content_types):


def soundcloud_get_set_items(token, url):
logger.info(f"Getting set items for {url}")
headers = {}
headers["user-agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"

Expand Down
46 changes: 29 additions & 17 deletions src/onthespot/api/spotify.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,15 +243,19 @@ def spotify_get_token(parsing_index):
return token


def spotify_get_artist_albums(token, artist_id):
logger.info(f"Get albums for artist by id '{artist_id}'")
def spotify_get_artist_album_ids(token, artist_id):
logger.info(f"Getting album ids for artist: '{artist_id}'")
headers = {"Authorization": f"Bearer {token}"}
resp = make_call(f'https://api.spotify.com/v1/artists/{artist_id}/albums?include_groups=album%2Csingle&limit=50', headers=headers) #%2Cappears_on%2Ccompilation
return [resp['items'][i]['external_urls']['spotify'] for i in range(len(resp['items']))]
artist_data = make_call(f'https://api.spotify.com/v1/artists/{artist_id}/albums?include_groups=album%2Csingle&limit=50', headers=headers) #%2Cappears_on%2Ccompilation

item_ids = []
for album in artist_data['items']:
item_ids.append(album['id'])
return item_ids


def spotify_get_playlist_data(token, playlist_id):
logger.info(f"Get playlist dump for '{playlist_id}'")
logger.info(f"Get playlist data for playlist: {playlist_id}")
headers = {"Authorization": f"Bearer {token}"}
resp = make_call(f'https://api.spotify.com/v1/playlists/{playlist_id}', headers=headers, skip_cache=True)
return resp['name'], resp['owner']['display_name']
Expand Down Expand Up @@ -432,9 +436,9 @@ def get_album_name(token, album_id):
resp['total_tracks']


def spotify_get_album_tracks(token, album_id):
logger.info(f"Get tracks from album by id '{album_id}'")
songs = []
def spotify_get_album_track_ids(token, album_id):
logger.info(f"Getting tracks from album: {album_id}")
tracks = []
offset = 0
limit = 50

Expand All @@ -444,11 +448,15 @@ def spotify_get_album_tracks(token, album_id):
resp = make_call(url, headers=headers)

offset += limit
songs.extend(resp['items'])
tracks.extend(resp['items'])

if resp['total'] <= offset:
break
return songs

item_ids = []
for track in tracks:
item_ids.append(track['id'])
return item_ids


def spotify_get_search_results(token, search_term, content_types):
Expand Down Expand Up @@ -603,14 +611,14 @@ def spotify_get_episode_metadata(token, episode_id):
logger.info(f"Get episode info for episode by id '{episode_id}'")
headers = {"Authorization": f"Bearer {token}"}
episode_data = make_call(f"https://api.spotify.com/v1/episodes/{episode_id}", headers=headers)
show_data = spotify_get_show_episodes(token, episode_data.get('show', {}).get('id', ''))
show_episode_ids = spotify_get_show_episode_ids(token, episode_data.get('show', {}).get('id', ''))
# I believe audiobook ids start with a 7 but to verify you can use https://api.spotify.com/v1/audiobooks/{id}
# the endpoint could possibly be used to mark audiobooks in genre but it doesn't really provide any additional
# metadata compared to show_data beyond abridged and unabridged.

track_number = ''
for index, episode in enumerate(show_data):
if episode['id'] == episode_id:
for index, episode in enumerate(show_episode_ids):
if episode == episode_id:
track_number = index + 1
break

Expand All @@ -627,7 +635,7 @@ def spotify_get_episode_metadata(token, episode_id):
info['track_number'] = track_number
# Not accurate
#info['total_tracks'] = episode_data.get('show', {}).get('total_episodes', 0)
info['total_tracks'] = len([episode for episode in show_data if episode])
info['total_tracks'] = len(show_episode_ids)
info['artists'] = conv_list_format([episode_data.get('show', {}).get('publisher', '')])
info['album_artists'] = conv_list_format([episode_data.get('show', {}).get('publisher', '')])
info['language'] = conv_list_format(episode_data.get('languages', []))
Expand All @@ -643,8 +651,8 @@ def spotify_get_episode_metadata(token, episode_id):
return info


def spotify_get_show_episodes(token, show_id):
logger.info(f"Get episodes for show by id '{show_id}'")
def spotify_get_show_episode_ids(token, show_id):
logger.info(f"Getting show episodes: {show_id}'")
episodes = []
offset = 0
limit = 50
Expand All @@ -659,4 +667,8 @@ def spotify_get_show_episodes(token, show_id):

if resp['total'] <= offset:
break
return episodes

item_ids = []
for episode in episodes:
item_ids.append(episode['id'])
return item_ids
Loading

0 comments on commit b2e4f27

Please sign in to comment.