-
Notifications
You must be signed in to change notification settings - Fork 0
/
dl.py
144 lines (127 loc) · 4.63 KB
/
dl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import io
import json
import os
import re
import warnings
from argparse import ArgumentParser
from collections import namedtuple
from urllib.parse import urlparse
import requests as rqs
from bs4 import BeautifulSoup as bs
from fpdf import FPDF
from PIL import Image
# Page dimensions (width x height).
# NOTE(review): presumably the pixel size targeted by the "rmk" choice of the
# --dim option; neither constant is referenced in the visible code - confirm.
RMK_W, RMK_H = 1404, 1872

# HTTP headers sent with page requests so they carry a desktop-browser
# user agent string.
HEADERS = dict([
    (
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.136 Safari/537.36",
    ),
])
def progBar(i, tot, message="Loading"):
    """Draw a 100-character text progress bar on the current terminal line.

    Args:
        i: Number of work units completed so far.
        tot: Total number of work units. A value of zero or less is treated
            as "nothing to do", i.e. the bar is shown as complete.
        message: Label printed in front of the bar.

    The bar is redrawn in place (carriage return, no newline). Once it is
    full, a final "<message> - DONE" line is printed and terminated with a
    newline.
    """
    if tot <= 0:
        # Avoid ZeroDivisionError: an empty job is a finished job.
        filled = 100
    else:
        # Clamp so that i > tot or a negative i cannot distort the bar width.
        filled = max(0, min(100, i * 100 // tot))
    remaining = 100 - filled
    print(f"{message} : \t[{'█'*filled}{'_'*remaining}]", end='\r')
    if remaining <= 0:
        # Trailing spaces overwrite what is left of the bar on that line.
        print(f"{message} - DONE{' '*105}", end='\n')
def parseNumList(sourceString):
    """Parse a comma-separated list of numbers and ranges into sorted ints.

    Each item is either a single non-negative integer ("3") or an inclusive
    range ("6-9"). Whitespace around an item is ignored. Duplicates are
    removed. A range whose end is smaller than its start collapses to the
    start value alone ("9-6" -> 9).

    Args:
        sourceString: Text such as "1,3,6-9".

    Returns:
        A sorted list of the distinct selected integers.

    Raises:
        AttributeError: If any item is not a number or a "start-stop" range.
    """
    selected = set()
    for item in sourceString.split(','):
        m = re.match(r'^(\d+)(?:-(\d+))?$', item.strip())
        if not m:
            raise AttributeError(
                "The second parameter must be the selected chapters by numbers (ex: 1,3,6-9 will select chap. 1, 3 and the chaps. 6 to 9)")
        start = int(m.group(1))
        stop = int(m.group(2)) if m.group(2) else start
        if stop < start:
            # Reversed range: keep only the first number.
            stop = start
        selected.update(range(start, stop + 1))
    return sorted(selected)
def getSiteSpecs(url):
    """Load the scraping specs matching the host of *url* from ./specs.json.

    The JSON file maps a host name (the URL's netloc) to a dict of settings,
    plus a "default" entry whose keys define the fields of the returned
    named tuple. Settings are matched to fields by name; any field missing
    from the site entry falls back to the value in "default".

    Args:
        url: Address whose netloc selects the entry in specs.json.

    Returns:
        A `Specs` named tuple with one attribute per key of "default".

    Raises:
        AttributeError: If the URL has no netloc or the netloc has no entry.
        TypeError: If the site entry has a key that "default" does not define.
    """
    netloc = urlparse(url).netloc
    # Context manager so the file handle is closed as soon as it is parsed.
    with open("./specs.json", 'r', encoding="utf-8") as specsFile:
        allSpecs = json.load(specsFile)
    defaults = allSpecs["default"]
    specsType = namedtuple('Specs', defaults.keys())
    if not netloc or netloc not in allSpecs:
        raise AttributeError("Bad url")
    # Build by keyword rather than by position: the key order of a site
    # entry does not have to match the key order of "default".
    return specsType(**{**defaults, **allSpecs[netloc]})
def handleError(err, url, args, action):
retryMessage = f"An error has occured!\nurl : {url}\nerror : {str(err)}\nWould you like to continue? [y/n] "
match action:
case "skip":
pass
case "retry":
scrap(url, args, "2ndtime")
case "raise":
raise
case "2ndtime":
print(
"It's the 2nd time this error has occured, maybe there's a real problem!")
raise
case "ask":
if input(retryMessage) == 'y':
scrap(url, args, "ask")
else:
raise
case _:
if input(retryMessage) == 'y':
scrap(url, args, "ask")
else:
raise
def scrap(chapUrl, args, onError=None):
    """Download every image of one chapter and save them as a single PDF.

    Args:
        chapUrl: URL of the chapter page.
        args: Parsed command-line arguments; reads `onError`, `pagesToSkip`
            and `destPath`.
        onError: Error policy overriding `args.onError` for this call
            (see `handleError`).

    The site-specific selectors come from `getSiteSpecs(chapUrl)`. The PDF
    is named after the chapter title (whitespace, slashes, dots and commas
    replaced by "_") and written into `args.destPath`, which is created if
    it does not exist yet.
    """
    S = getSiteSpecs(chapUrl)
    try:
        soup = bs(rqs.get(chapUrl, headers=HEADERS).text, "lxml")
    except rqs.exceptions.ConnectionError as err:
        handleError(err, chapUrl, args, onError or args.onError)
        # If handleError returned, the chapter was skipped or already handled
        # by a retry; there is no page to parse here, so stop instead of
        # falling through with `soup` undefined.
        return
    title = re.sub(r'[\s/\\.,]+', '_', soup.select(S.title)[0].text.strip())
    progBar(0, 1, title)

    # Collect the image URLs, following the pagination when the site has one.
    pageList = []
    pageNum = 1
    while True:
        if S.pageFormat:
            soup = bs(rqs.get(S.pageFormat.format(
                url=chapUrl, n=pageNum), headers=HEADERS).text, "lxml")
        # A separate index variable keeps the pagination counter intact.
        for idx, image in enumerate(soup.select(S.img)):
            if args.pagesToSkip and idx + 1 in args.pagesToSkip:
                continue
            src = image.get(S.src)
            if src:  # ignore tags that lack the expected attribute
                pageList.append(src)
        # Without a page format there is no other document to fetch, so
        # looping again would re-read the same soup forever.
        if not S.pageFormat or not S.nextBtn or not soup.select(S.nextBtn):
            break
        pageNum += 1

    # One PDF page per image; the page size is the pixel size divided by 72,
    # expressed in inches.
    pdf = FPDF(unit="in")
    pdf.set_margins(0, 0)
    pdf.set_auto_page_break(False)
    for done, pageUrl in enumerate(pageList, start=1):
        # Same headers as for the HTML pages, for consistency.
        with Image.open(io.BytesIO(rqs.get(pageUrl, headers=HEADERS).content)) as image:
            pdf.add_page(format=(image.width/72, image.height/72))
            pdf.image(image)
        progBar(done, len(pageList), title)

    if args.destPath:
        os.makedirs(args.destPath, exist_ok=True)
    pdf.output(os.path.join(args.destPath, title + ".pdf"))
def main(args):
    """Fetch the chapter list at `args.url` and download the selected ones.

    Args:
        args: Parsed command-line arguments; reads `url` and `selectedChaps`
            here and is forwarded unchanged to `scrap`.

    `selectedChaps` holds positions in the chapter list (0 is the first
    entry once the list is in ascending order). Processing stops with a
    warning at the first position beyond the end of the list.
    """
    # Local import: resolves relative chapter links against the list URL.
    from urllib.parse import urljoin

    specs = getSiteSpecs(args.url)
    progBar(0, 1, "Getting chaps list")
    # Same headers as every other page request of the script.
    listing = rqs.get(args.url, headers=HEADERS).text
    chaps = bs(listing, "lxml").select(specs.chaptersListElt)
    progBar(1, 1, "Getting chaps list")
    if specs.chaptersListOrderDescending:
        chaps = chaps[::-1]

    selected = args.selectedChaps
    if isinstance(selected, int):
        # Tolerate a bare int (e.g. a scalar parser default).
        selected = [selected]

    for i in selected:
        if i >= len(chaps):
            warnings.warn(f"Chapter {i} and after are not available")
            break
        href = chaps[i].get('href')
        if not href:
            warnings.warn(f"Chapter {i} has no link, skipping it")
            continue
        # Absolute hrefs pass through urljoin unchanged.
        scrap(urljoin(args.url, href), args)
# Command-line entry point: build the argument parser and run the download.
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('url', help="URL of the page listing the chapters")
    # The default must be a list: the selection is iterated, and a bare int
    # is not iterable.
    parser.add_argument(
        '-s', '--select', dest='selectedChaps', type=parseNumList, default=[0],
        help="chapters to download, e.g. 1,3,6-9 (default: 0)")
    parser.add_argument('-S', '--skip', dest='pagesToSkip',
                        type=parseNumList, default=None,
                        help="image positions to leave out, e.g. 1,3,6-9")
    parser.add_argument('-d', '--dest', dest='destPath',
                        default=f"{os.getcwd()}/chaps/",
                        help="directory receiving the PDF files")
    parser.add_argument('-D', '--dim', dest='dimMethod',
                        choices=["freq", "max", "rmk"], default="freq")
    parser.add_argument('--onError', dest='onError',
                        choices=["skip", "retry", "raise", "ask"], default="ask",
                        help="what to do when a chapter page cannot be fetched")
    main(parser.parse_args())