-
Notifications
You must be signed in to change notification settings - Fork 0
/
pyms.py
155 lines (142 loc) · 7.65 KB
/
pyms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Module metadata: release version, author and license identifier.
__version__ = "0.2.0"
__author__ = "Samuel Haidu"
__license__ = "MIT"
'''
Module for searching music videos on youtube.com and getting info from discogs.com
You can:
-Search videos and artists on YouTube
-Get the top 100 of YouTube Music
-Run an artist search on Discogs
-Get the albums of an artist (listed on Discogs, from its URL)
-Get the tracks of an album (listed on Discogs, from its URL)
BASED ON HTTP REQUESTS; it is not an API. If the sites change their page format, the script may stop working.
'''
from bs4 import BeautifulSoup
import requests
# Parser name handed to BeautifulSoup whenever fetched HTML is turned into a parse tree.
PARSER = 'html.parser'
def YTSearchVideos(query):
    '''Search YouTube for videos by scraping the HTML results page.

    The query is sent as the ``q`` parameter of
    ``http://www.youtube.com/results`` together with the fixed
    ``sp=EgIQAVAU`` value (presumably a "videos only" filter -- TODO confirm).
    The TLS bundle is read from ``cacert.pem``.

    Args:
        query: Free-text search terms. Passed through ``params`` so that
            spaces become ``+`` and characters such as ``&`` or ``#`` are
            percent-encoded instead of corrupting the URL.

    Returns:
        A list with one dict per result tile, holding the keys ``title``,
        ``link``, ``duration``, ``channelname``, ``channelurl`` and
        ``thumbnail``. Missing optional pieces fall back to ``'00:00'``
        (duration) or ``''`` (channel name, channel URL, thumbnail).

    Raises:
        AttributeError: if a tile has a link but no ``yt-lockup-title``
            element containing an anchor.
        requests.exceptions.RequestException: on network failure.
    '''
    response = requests.get(
        'http://www.youtube.com/results',
        params=[('q', query), ('sp', 'EgIQAVAU')],
        verify='cacert.pem',
    )
    soup = BeautifulSoup(response.text, PARSER)

    videos = []
    for tile in soup.findAll(attrs={'class': 'yt-lockup-tile'}):
        anchor = tile.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            # A tile without a link cannot describe a video; skip it rather
            # than abort the whole search.
            continue
        if href.startswith('https://googleads.g.doubleclick.net/'):
            # Advertisement tile.
            continue

        title_box = tile.find(attrs={'class': 'yt-lockup-title'})
        title = title_box.find('a').get('title')

        # The span text is trimmed by 3 leading and 1 trailing character and
        # the second whitespace-separated token is taken as the duration
        # (presumably text like " - Duration: 3:45." -- TODO confirm).
        duration = '00:00'
        span = title_box.find('span')
        if span is not None:
            tokens = span.text[3:-1].split()
            if len(tokens) > 1:
                duration = tokens[1]

        thumb_box = tile.find(attrs={'class': 'yt-thumb-simple'})
        image = thumb_box.find('img') if thumb_box is not None else None
        thumbnail = image.get('src') if image is not None else ''

        byline = tile.find(attrs={'class': 'yt-lockup-byline'})
        channel_anchor = byline.find('a') if byline is not None else None
        channelname = ''
        channelurl = ''
        if channel_anchor is not None:
            channelname = channel_anchor.text
            channel_href = channel_anchor.get('href')
            if channel_href is not None:
                channelurl = 'https://www.youtube.com' + channel_href

        videos.append({'title': title, 'link': 'https://www.youtube.com' + href,
                       'duration': duration, 'channelname': channelname,
                       'channelurl': channelurl, 'thumbnail': thumbnail})
    return videos
def YTSearchMusicOfArtist(query):
    '''Return the videos listed in the first "watch-card" of a YouTube search.

    The query is sent as the ``search_query`` parameter of
    ``http://www.youtube.com/results``; the TLS bundle is read from
    ``cacert.pem``.

    Args:
        query: Artist name / search terms. Passed through ``params`` so it
            is URL-encoded correctly.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts. The lookup is best
        effort: if the page has no watch-card, or an entry lacks the expected
        markup, whatever was collected so far (possibly ``[]``) is returned.

    Raises:
        requests.exceptions.RequestException: on network failure.
    '''
    response = requests.get(
        'http://www.youtube.com/results',
        params={'search_query': query},
        verify='cacert.pem',
    )
    soup = BeautifulSoup(response.text, PARSER)

    videos = []
    try:
        card = soup.findAll(attrs={'class': 'watch-card'})[0]
        for entry in card.findAll(attrs={'class': 'watch-card-main-col'}):
            # NOTE(review): only the first 21 characters of the href are kept
            # and appended after a base that already ends in '/'; if the href
            # is site-relative this yields a double slash -- confirm against
            # the live markup before changing.
            link = 'http://www.youtube.com/' + entry.find('a').get('href')[:21]
            videos.append({'title': entry.get('title'), 'link': link})
    except (IndexError, AttributeError, TypeError):
        # IndexError: no watch-card on the page; AttributeError/TypeError:
        # an entry without an anchor or href. Keep the partial result.
        pass
    return videos
def getYTMusicTop():
    '''Scrape a fixed YouTube playlist (the author's "top 100 music" list).

    Returns a list of dicts, one per ``pl-video`` row of the playlist page,
    each with the keys ``title``, ``link``, ``duration`` and ``thumbnail``.
    The TLS bundle is read from ``cacert.pem``.
    '''
    page = requests.get(
        "http://www.youtube.com/playlist?list=PLFgquLnL59alcyTM2lkWJU34KtfPXQDaX",
        verify='cacert.pem',
    )
    soup = BeautifulSoup(page.text, PARSER)

    def describe(row):
        # Pull the four published fields out of one playlist row.
        entry = {'title': row.get('data-title')}
        entry['link'] = 'http://www.youtube.com/watch?v=' + row.get('data-video-id')
        entry['duration'] = row.find(attrs={'class': 'timestamp'}).text
        clip = row.find(attrs={'class': 'yt-thumb-clip'})
        entry['thumbnail'] = clip.find('img').get('data-thumb')
        return entry

    return [describe(row) for row in soup.findAll(attrs={'class': 'pl-video'})]
def artistSearch(query, limit=5):
    '''Search artists on discogs.com by scraping the search results page.

    The request goes to ``http://www.discogs.com/search/`` with the
    parameters ``q=<query>`` and ``type=artist``; the TLS bundle is read from
    ``cacert.pem``.

    Args:
        query: Artist name / search terms. Passed through ``params`` so it
            is URL-encoded correctly.
        limit: Stop after this many artists have been collected. A value the
            running count never equals (for example ``0``, a negative
            number or ``None``) means every card on the page is returned.

    Returns:
        A list of ``{'name': ..., 'url': ..., 'image': ...}`` dicts.

    Raises:
        AttributeError, TypeError: if a card lacks the expected anchor,
            ``h4`` or ``img`` markup.
        requests.exceptions.RequestException: on network failure.
    '''
    response = requests.get(
        'http://www.discogs.com/search/',
        params=[('q', query), ('type', 'artist')],
        verify='cacert.pem',
    )
    soup = BeautifulSoup(response.text, PARSER)

    artists = []
    for card in soup.findAll(attrs={'class': 'card'}):
        url = 'http://www.discogs.com' + card.find('a').get('href')
        name = card.find('h4').find('a').get('title')
        image = card.find('img').get('data-src')
        artists.append({'name': name, 'url': url, 'image': image})
        if len(artists) == limit:
            break
    return artists
def getAlbunsFromArtist(artisturl):
    '''Return the master albums listed on a Discogs artist page.

    Args:
        artisturl: Full URL of the artist page on discogs.com. It is fetched
            as-is, with the TLS bundle read from ``cacert.pem``.

    Returns:
        A list of dicts, one per element carrying BOTH the ``card`` and the
        ``master`` CSS class, with the keys ``name``, ``url``,
        ``artistname``, ``image``, ``year``, ``country`` and ``recorder``.
        ``recorder`` is the label names, each followed by ``", "`` (so the
        string ends with a trailing separator when non-empty).

    Raises:
        AttributeError, TypeError: if a matching element lacks the expected
            markup.
        requests.exceptions.RequestException: on network failure.
    '''
    response = requests.get(artisturl, verify='cacert.pem')
    soup = BeautifulSoup(response.text, PARSER)

    def is_master_card(tag):
        # A dict literal cannot hold the 'class' key twice (the last value
        # wins), so the two-class requirement is expressed as a tag filter.
        classes = tag.get('class') or ()
        if isinstance(classes, str):
            classes = classes.split()
        return 'card' in classes and 'master' in classes

    albuns = []
    for row in soup.findAll(is_master_card):
        name = row.find(attrs={'class': 'title'}).find('a').text
        url = 'http://www.discogs.com' + row.find(attrs={'class': 'image'}).find('a').get('href')
        artistname = row.find(attrs={'class': 'artist'}).find('a').text
        image = row.find(attrs={'class': 'thumbnail_center'}).find('img').get('data-src')
        year = row.find(attrs={'class': 'year'}).text
        country = row.find(attrs={'class': 'country'}).find('span').text
        # Every label name keeps its ", " suffix, matching the historical
        # output format callers may already post-process.
        label_links = row.find(attrs={'class': 'label'}).findAll('a')
        recorders = ''.join(anchor.text + ", " for anchor in label_links)
        albuns.append({'name': name, 'url': url, 'artistname': artistname,
                       'image': image, 'year': year, 'country': country,
                       'recorder': recorders})
    return albuns
def getTracksFromAlbum(albumurl):
    '''Return the track list and album details of a Discogs album page.

    Args:
        albumurl: Full URL of the album page on discogs.com. It is fetched
            as-is, with the TLS bundle read from ``cacert.pem``.

    Returns:
        A list whose leading items are track dicts with the keys ``name``,
        ``tracknum`` and ``duration``, followed by ONE final dict describing
        the album with the keys ``genre``, ``style``, ``albumname``,
        ``artistname``, ``year`` and ``cover``. ``genre`` and ``style`` are
        names each followed by ``", "`` (trailing separator included).

    Raises:
        AttributeError, IndexError: if the page lacks the expected
            ``playlist`` / ``profile`` / ``thumbnail_center`` markup.
        requests.exceptions.RequestException: on network failure.
    '''
    response = requests.get(albumurl, verify='cacert.pem')
    soup = BeautifulSoup(response.text, PARSER)

    tracks = []
    playlist = soup.find(attrs={'class': 'playlist'})
    for row in playlist.findAll(attrs={'class': 'tracklist_track'}):
        tracknum = row.get('data-track-position')
        name = row.find(attrs={'class': 'tracklist_track_title'}).text
        duration = row.find(attrs={'class': 'tracklist_track_duration'}).find('span').text
        tracks.append({'name': name, 'tracknum': tracknum, 'duration': duration})

    # The profile box is looked up once and reused for all album fields.
    profile = soup.find(attrs={'class': 'profile'})
    genre_links = profile.findAll(attrs={'itemprop': 'genre'})[0].findAll('a')
    # Positional access: the 2nd and 3rd 'content' elements are read as the
    # style list and the year respectively.
    contents = profile.findAll(attrs={'class': 'content'})
    style_links = contents[1].findAll('a')
    albumgenre = ''.join(anchor.text + ', ' for anchor in genre_links)
    albumstyle = ''.join(anchor.text + ', ' for anchor in style_links)

    heading = profile.find('h1')
    albumname = heading.findAll('span')[1].find('a').text
    albumartist = heading.find('span').find('span').get('title')
    albumyear = contents[2].findAll('a')[0].text
    coverurl = soup.find(attrs={'class': 'thumbnail_center'}).find('img').get('src')

    # Last item: album-level information, including the artist name that was
    # previously extracted but never returned.
    tracks.append({'genre': albumgenre, 'style': albumstyle, 'albumname': albumname,
                   'artistname': albumartist, 'year': albumyear, 'cover': coverurl})
    return tracks