caprice_parser.py
from time import sleep
import requests
import re
import json
stations_regex = r'<a href="(.*?\.html)" class="genres">(.*?)<span><img.*?/>(.*?)</span></a>(<br>|</td>|<br><a)?'
station_pic_regex = r'<img style="border: 1px solid #000000;" src="(.*?)" width="300" height="300" alt=""/>'
m3u_regex = r'<a href="(.*\.m3u)">'
ip_from_m3u_regex = r'([a-z]{4,5}://[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]+)'
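# For orientation, the markup these patterns are written against looks roughly
# like the fragments below. They are hypothetical, reconstructed from the
# regexes themselves; the real radcap.ru pages may differ in detail:
#
#   <a href="blues.html" class="genres">Blues Radio<span><img src="..."/> 128k</span></a><br>
#   <img style="border: 1px solid #000000;" src="pic/blues.jpg" width="300" height="300" alt=""/>
#   <a href="blues.m3u">
#   http://192.0.2.1:8000        <- line expected inside the downloaded .m3u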
cookie = {'beget': 'begetok'} # magic cookie the server waits for
base_page = 'http://radcap.ru/'
start_page = 'http://radcap.ru/'
resulting_json = []
# We parse the main page, then visit each station page to extract the
# station picture and the link to its m3u playlist. As the last step we
# download the m3u playlist and parse the stream's IP address out of it.

# parsing the genre/station links from the main page
genres_html = requests.get(start_page, cookies=cookie)
g_match = re.findall(stations_regex, genres_html.text, re.UNICODE)
skipper = 0  # request counter; used to throttle requests and avoid getting banned
for genre_match_item in g_match:
    skipper += 1
    # wait for a while to avoid banning.
    if skipper % 5 == 0:
        sleep(1)
    genre_json = dict()
    genre_json['url'] = '{}{}'.format(base_page, genre_match_item[0])
    genre_json['name'] = u'{}'.format(genre_match_item[1])
    print('Parsing ', genre_json['url'])
    # get playlist m3u from station page
    playlist_html = requests.get(genre_json['url'], cookies=cookie)
    p_match = re.findall(m3u_regex, playlist_html.text)
    pic_match = re.findall(station_pic_regex, playlist_html.text)
    m3u_link = '{}{}'.format(base_page, p_match[0])
    pic_link = '{}{}'.format(base_page, pic_match[0])
    # now download the m3u and parse address
    m3u_file_contents = requests.get(m3u_link, cookies=cookie)
    ip_match = re.findall(ip_from_m3u_regex, m3u_file_contents.text)
    stream_address = ip_match[0]
    genre_json['pic_url'] = pic_link
    genre_json['stream'] = stream_address
    resulting_json.append(genre_json)
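# Note (added): the [0] indexing above (p_match[0], pic_match[0], ip_match[0])
# assumes every station page matches the regexes; if radcap.ru changes its
# markup those lines raise IndexError. A defensive variant could skip such
# stations inside the loop, e.g. (sketch):
#     if not (p_match and pic_match and ip_match):
#         continue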
# How many of the parsed stations belong to each group (hard-coded; parallel to the 'names' list below)
counters = [
10,
8,
14,
4,
28,
18,
47,
6,
50,
69,
8,
6,
16
]
# the global list of station groups (parallel to 'counters' above; a sanity check follows below)
names = ["ETHNIC / FOLK / SPIRITUAL MUSIC",
"CLASSICAL",
"BLUES",
"COUNTRY",
"JAZZ",
"POP MUSIC",
"ROCK",
"REGGAE / SKA",
"METAL / HARDCORE",
"ELECTRONIC MUSIC",
"RAP / HIP-HOP",
u"ШАНСОН",
"MISCELLANEOUS"
]
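# Sanity check (added sketch): 'counters' and 'names' are parallel lists whose
# per-group counts are a hard-coded snapshot of the site layout, so together
# they should cover exactly the stations parsed above. The asserts just make a
# silent mismatch fail loudly; drop them if the site layout has drifted.
assert len(counters) == len(names), "counters and names must be parallel lists"
assert sum(counters) == len(resulting_json), "group counts do not match the number of parsed stations"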
global_i = 0
current_counter_index = 0
final_json = []
# distribute the parsed stations into the global genre groups
for x in names:
    final_record = dict()
    current_count = counters[current_counter_index]
    print('Group name:', x)
    final_record['genre'] = x
    final_stations = list()
    for station_index in range(global_i, global_i + current_count):
        print('Current station: ', resulting_json[station_index]['name'])
        final_stations.append(resulting_json[station_index])
    final_record['stations'] = final_stations
    final_json.append(final_record)
    global_i += current_count
    current_counter_index += 1
# saving the result
with open("result.json", 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Cyrillic group name readable in the UTF-8 file
    json.dump(final_json, f, ensure_ascii=False)
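# The written result.json has this shape (illustrative values, reconstructed
# from the dicts built above; actual names/URLs come from the scraped pages):
# [
#   {
#     "genre": "BLUES",
#     "stations": [
#       {"url": "http://radcap.ru/blues.html",
#        "name": "Blues Radio",
#        "pic_url": "http://radcap.ru/pic/blues.jpg",
#        "stream": "http://192.0.2.1:8000"},
#       ...
#     ]
#   },
#   ...
# ]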