forked from Nudin/mpv-script-directory
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapewiki.py
executable file
·119 lines (102 loc) · 3.55 KB
/
scrapewiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
import json
import re
from pprint import pprint
import requests
from bs4 import BeautifulSoup
from querystars import re_gist, re_github, re_gitlab, updatestars
# Download the mpv wiki page that lists user scripts and collect, in document
# order, every list item (<li>) and section heading (<h2>) inside the element
# with id "wiki-body".
#
# The timeout stops the scraper from hanging forever on a stalled connection.
# raise_for_status() turns an HTTP error response into an immediate HTTPError
# instead of letting an error page be parsed as if it were the wiki.
page = requests.get(
    "https://github.com/mpv-player/mpv/wiki/User-Scripts", timeout=30
)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
wiki_body = soup.find(id="wiki-body")
if wiki_body is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the .select() call below.
    raise RuntimeError('no element with id "wiki-body" found in the wiki page')
elements = wiki_body.select("li, h2")
# --- Operating-system keywords searched for in a script's description -------
# "Windows" / "windows" as a whole word.
re_windows = re.compile(r"\b[wW]indows\b")
# "Linux" / "linux" as a whole word.
re_linux = re.compile(r"\b[lL]inux\b")
# "mac" as a whole word, or as the start of "macos", in any letter case.
re_mac = re.compile(r"\bmac(os|\b)", flags=re.IGNORECASE)
# The literal "*nix" or the (case-sensitive) word "Unix".
re_unix = re.compile(r"\*nix|Unix")

# --- URL helpers --------------------------------------------------------------
# A leading http:// or https:// scheme.
re_proto = re.compile(r"^https?://")
# Group 1 is the host part of an http(s) URL; group 2 is the last non-empty
# path segment, with any trailing "/" and anything from the first "#" or "&"
# left out.
re_domain_file = re.compile(
    r"^https?://([^/]*)/(?:[^#&]*/)*([^/#&]+)/?(?:#.*|&.*)*$"
)
def generateId(name, url):
    """Derive an identifier for a script from the URL it is published at.

    The URL is tried against the hosting-service patterns imported from
    ``querystars`` (GitHub, then GitLab, then Gist).  On the first full
    match the captured groups are joined with "/" (unmatched groups count
    as empty strings), trailing slashes are removed, and the result is
    prefixed with "github:", "gitlab:" or "gist:".

    Any other URL accepted by ``re_domain_file`` yields
    "<host>:<last path segment>".  If nothing matches, the placeholder
    "XXX" is returned.

    ``name`` is accepted but not used.
    """
    hosted_patterns = (
        ("github:", re_github),
        ("gitlab:", re_gitlab),
        ("gist:", re_gist),
    )
    for prefix, pattern in hosted_patterns:
        found = pattern.fullmatch(url)
        if found:
            # groups("") substitutes "" for optional groups that did not
            # participate, which can leave trailing "/" to strip.
            joined = "/".join(found.groups(""))
            return prefix + joined.rstrip("/")

    found = re_domain_file.fullmatch(url)
    if found:
        domain = found.group(1)
        filename = found.group(2)
        return f"{domain}:{filename}"

    return "XXX"  # FIXME
def uniquefy(identifier, name, test):
    """Return a key for a new entry that does not collide with keys in *test*.

    Parameters:
        identifier: the preferred key for the new entry.
        name: display name of the new entry, used to disambiguate.
        test: dict of existing entries keyed by identifier; every value is
            a mapping with at least a "name" item.  It is modified in place
            when a collision has to be resolved.

    Returns:
        ``identifier`` itself when it is free.  Otherwise the entry that
        currently owns ``identifier`` is moved to
        "<identifier>/<its name>", and the new entry gets
        "<identifier>/<name>".  Whenever one of those keys is already
        taken, the first free "<identifier>-<n>" (n = 2, 3, ...) is used
        instead.

    The returned key is never inserted by this function; the caller does
    that.
    """
    if identifier not in test:
        return identifier

    # Move the current owner of ``identifier`` out of the way.  The target
    # key must itself be free, otherwise the move would silently overwrite
    # (and lose) an unrelated entry.
    alt = f"{identifier}/{test[identifier]['name']}"
    counter = 2
    while alt in test:
        alt = f"{identifier}-{counter}"
        counter += 1
    test[alt] = test.pop(identifier)

    # Pick the key for the new entry, checked after the move above so it
    # can never equal the relocated key.
    unique = f"{identifier}/{name}"
    counter = 2
    while unique in test:
        unique = f"{identifier}-{counter}"
        counter += 1
    return unique
def extractText(element):
    """Return the text of *element* without its first text node.

    All text nodes found under *element* are collected; the first one is
    dropped, the remaining ones are concatenated, and surrounding
    whitespace is stripped.  An element with zero or one text node yields
    an empty string.
    """
    nodes = element.find_all(text=True)
    # Slicing (rather than indexing) keeps this safe for an empty result.
    remainder = nodes[1:]
    combined = "".join(remainder)
    return combined.strip()
def normalizeType(type):
    """Normalize a section heading into a script-type label.

    The heading is stripped of surrounding whitespace and lower-cased, and
    one trailing "s" is removed so that plural headings become singular
    ("Lua Scripts" -> "lua script").  The label "c plugin" is re-capitalized
    to "C plugin".

    An empty or whitespace-only heading yields "" instead of raising
    IndexError.
    """
    label = type.strip().lower()
    # endswith() is safe on the empty string, unlike indexing with [-1].
    if label.endswith("s"):
        label = label[:-1]
    if label == "c plugin":
        label = "C plugin"
    return label
def _detectSystems(desc):
    """Return the operating systems mentioned in *desc*, without duplicates."""
    systems = []
    if re_linux.search(desc):
        systems.append("Linux")
    if re_windows.search(desc):
        systems.append("Windows")
    if re_mac.search(desc):
        systems.append("Mac")
    if re_unix.search(desc):
        # "*nix"/"Unix" stands for both Linux and Mac; add only the ones not
        # already recorded so the list never contains an entry twice.
        systems += [s for s in ("Linux", "Mac") if s not in systems]
    return systems


# Build the script directory: one dict per linked list item, keyed by a
# unique identifier derived from the link URL.
allscripts = {}
# Label of the most recent <h2> heading; None until the first one is seen.
script_type = None
for entry in elements:
    if entry.name == "h2":
        # A heading starts a new category for the list items that follow.
        script_type = normalizeType(entry.text)
        continue
    if script_type is None:
        # List items before the first heading belong to no category; skip
        # them instead of failing on an unset category.
        continue

    link = entry.find("a")
    if link is None:
        continue
    url = link.attrs.get("href")
    if url is None:
        # An anchor without a target cannot point at a script.
        continue
    name = link.text

    # uniquefy() may rename an already stored entry, so it has to run before
    # the new entry is inserted at the end of this iteration.
    scriptID = uniquefy(generateId(name, url), name, allscripts)
    script = {"name": name, "url": url, "type": script_type}

    # For GitHub/GitLab links, record where the repository is fetched from
    # and where it is installed.  GitHub is tried first; the first full
    # match wins.
    for host, pattern in (("github", re_github), ("gitlab", re_gitlab)):
        match = pattern.fullmatch(url)
        if not match:
            continue
        groups = match.groups()
        # The first two groups are used as the two path components of the
        # repository URL (presumably owner and repository name; the patterns
        # are defined in querystars).
        script["receiving_url"] = "https://%s.com/%s/%s" % ((host,) + groups[0:2])
        script["install_dir"] = "%s/%s/%s" % ((host,) + groups[0:2])
        # When the link points at a single .lua file of a Lua script,
        # remember that file as the script to load.
        if groups[2] and script_type == "lua script" and groups[2].endswith(".lua"):
            script["scriptfiles"] = [groups[2]]
        break

    paragraph = entry.find("p")
    if paragraph is not None:
        desc = extractText(paragraph)
        script["desc"] = desc
        script["os"] = _detectSystems(desc)

    allscripts[scriptID] = script
# Hand the collected entries to querystars.updatestars() and continue with
# whatever it returns (presumably the same entries enriched with star counts;
# the function is not defined in this file).
allscripts = updatestars(allscripts)

# Show the final directory on stdout for inspection.
pprint(allscripts)

# Persist the directory as JSON indented by four spaces.
with open("mpv_script_directory.json", mode="w") as outfile:
    json.dump(allscripts, outfile, indent=4)