nvpin.py
# -*- coding: utf-8 -*-
"""Download web novels as plain-text files: GaoH scrapes www.gaohbook.net by
numeric book id, YuBook scrapes m.yubook.la by following "next page" links."""
import re
import traceback

import requests
from bs4 import BeautifulSoup


class GaoH:
    """Scraper for www.gaohbook.net; pages are parsed with regular expressions."""

    def __init__(self):
        self.base_url = "http://www.gaohbook.net/"
        # Regexes for the book page: title, author, description, the chapter-list
        # <ul> and the chapter links inside it.
        self.title_pattern = r'<span class="title">(.*)</span>'
        self.author_pattern = r'<a href="/author/.+">(.+)</a>'
        self.description_pattern = r'(?s)<div class="description">(.*?)</div>'
        self.ul_pattern = r'(?s)<ul class="nav chapter-list">(.*?)</ul>'
        self.href_pattern = r'<li><a href="(.*)">.*</a></li>'

    def catch_chapter(self, book_id):
        """Fetch the book page and return (book name, header text, chapter URLs)."""
        print("获取章节列表...")  # fetching chapter list
        url = self.base_url + "book/%s.html"
        try:
            res = requests.get(url % str(book_id))
            text = res.text
        except Exception:
            print("小说页面获取失败: %s" % traceback.format_exc())  # book page request failed
            return None, None, None
        book_name = re.search(self.title_pattern, text).group(1)
        book_name = book_name.lstrip('《')
        book_name = book_name.rstrip('》')
        author = re.search(self.author_pattern, text).group(1)
        des = re.search(self.description_pattern, text).group(1).split('<br />')
        des = [data.strip() for data in des]
        des = '\n'.join(des)
        # Header written at the top of the output file: name, author, description.
        title = "%s\n\n作者:%s\n\n%s\n\n\n\n" % (book_name, author, des)
        chap_ul = re.search(self.ul_pattern, text)
        if chap_ul:
            print("获取章节列表成功!")  # chapter list found
            chap_str = chap_ul.group(1)
        else:
            print("获取章节列表失败!")  # chapter list not found
            return None, None, None
        # Collect the chapter URLs from the <li> items of the chapter list.
        chap_list = chap_str.split("\n")
        chap_id_list = list()
        for chap in chap_list:
            if chap == "":
                continue
            res = re.match(self.href_pattern, chap)
            if res:
                chap_id_list.append(res.group(1))
        if len(chap_id_list) == 0:
            print("获取章节id列表失败!")  # no chapter URLs extracted
            return None, None, None
        return book_name, title, chap_id_list

    def catch_content(self, addr):
        """Fetch one chapter page and return (chapter title, body text)."""
        print("获取%s正文内容..." % addr)  # fetching chapter body
        url = self.base_url + addr
        try:
            res = requests.get(url)
            html = res.text
        except Exception:
            print("小说页面%s获取失败: %s" % (addr, traceback.format_exc()))
            return None, None
        pattern = r'(?s)<div class="content">(.*?)</div>'
        res = re.search(pattern, html)
        if res:
            text_str = res.group(1)
        else:
            print("获取正文内容失败!")  # chapter body not found
            return None, None
        # Split on <br />, drop empty lines and indent each paragraph.
        text_list = text_str.split("<br />")
        text_list = [text.strip(' ') for text in [text.strip() for text in text_list]]
        text_list = [" %s" % text for text in text_list if text != '']
        # The first non-empty line repeats the chapter title; the rest is the body.
        chap_name = text_list[0].strip()
        return chap_name, '\n'.join(text_list[1:])

    def run(self, book_id):
        """Download every chapter of book_id into '<book name>.txt'."""
        print("开始...")  # starting
        try:
            book_name, title, chap_list = self.catch_chapter(book_id)
        except Exception:
            print("章节获取失败:%s" % traceback.format_exc())
            return
        if not chap_list:
            # catch_chapter already reported why it failed
            return
        # Write UTF-8 so the Chinese text does not depend on the platform default encoding.
        with open('%s.txt' % book_name, 'w', encoding='utf-8') as txt:
            txt.write(title)
            for count, chap in enumerate(chap_list):
                name, data = self.catch_content(chap)
                if not data:
                    print('%s章节获取内容为空,已停止。' % chap)  # empty chapter, stop
                    break
                txt.write("第%d章:%s\n" % (count + 1, name))
                txt.write(data)
                txt.write('\r\n\r\n')
            print("下载完毕!")  # download finished


class YuBook:
    """Scraper for m.yubook.la; follows each chapter's "next page" link."""

    def __init__(self):
        self.base_url = 'https://m.yubook.la'
        self.headers = {
            'Host': 'm.yubook.la',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
        }

    def run(self, addr):
        """Download the book whose index page lives at base_url + addr."""
        print("开始...")  # starting
        url = "%s%s" % (self.base_url, addr)
        try:
            res = requests.get(url, headers=self.headers)
            res.encoding = res.apparent_encoding
            html = res.text
        except Exception:
            print("章节列表%s获取失败: %s" % (addr, traceback.format_exc()))  # chapter list request failed
            return
        soup = BeautifulSoup(html, "lxml")
        # Book metadata and the link to the first content page.
        book_name = soup.find_all("div", class_="nav_name", limit=1)[0].text
        author = soup.find_all("p", class_="p1", limit=1)[0].text
        des = soup.find_all("p", class_="p2")[1].text.strip()
        start_page = soup.find_all("div", class_="nav_p2")[0].a['href']
        tt = "%s\n\n%s\n\n%s\n\n\n\n" % (book_name, author, des)
        with open('%s.txt' % book_name, 'w', encoding='utf-8') as txt:
            txt.write(tt)
            print("获取%s正文内容..." % addr)  # fetching chapter body
            url = "%s%s" % (self.base_url, start_page)
            # Follow the "next page" link until catch_content reports the end.
            while True:
                try:
                    res = requests.get(url, headers=self.headers)
                    res.encoding = res.apparent_encoding
                    html = res.text
                except Exception:
                    print("章节%s获取失败: %s" % (addr, traceback.format_exc()))
                    return
                ch_name, novel_ctt, next_page = self.catch_content(html)
                txt.write(ch_name)
                txt.write('\r\n')
                txt.write(novel_ctt)
                txt.write('\r\n\r\n')
                if next_page is None:
                    print("下载完毕!")  # download finished
                    break
                url = "%s%s" % (self.base_url, next_page)
                print("获取%s正文内容..." % next_page)

    @staticmethod
    def catch_content(html):
        """Parse a chapter page; return (chapter heading, body, next-page path or None)."""
        soup = BeautifulSoup(html, "lxml")
        ch_name = soup.find_all('h1')[1].text
        # The heading looks like "<number>:<title>" (half- or full-width colon).
        ch = re.search(r'(\d+)[::](.+)', ch_name)
        count = ch.group(1)
        name = ch.group(2)
        ch_name = "第%s章:%s" % (count, name)
        ctt = soup.find_all("div", id="novelcontent", class_="novelcontent")[0].text.strip()
        ctt = '\n'.join([" %s" % text for text in ctt.split(" ")])
        # The "p4" link points at the next page; when it is not an .html page
        # the book has ended, so return None to stop the caller's loop.
        ad = soup.find_all("a", class_="p4")[0].attrs['href']
        if ad[-4:] != 'html':
            return ch_name, ctt, None
        return ch_name, ctt, ad


if __name__ == "__main__":
    gh = GaoH()
    bid = input("请输入整型小说id:")  # prompt for the numeric book id
    gh.run(bid)
    # yb = YuBook()
    # pg = input("请输入整型小说id:")
    # yb.run(pg)
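
# Usage sketch -- the id and path below are hypothetical placeholders, not real
# pages. GaoH.run() expects the numeric id from gaohbook.net's /book/<id>.html
# URLs, while YuBook.run() appends its argument to https://m.yubook.la unchanged,
# so despite the commented-out prompt above it needs a URL path, not an integer.
#
#     GaoH().run(12345)            # fetches http://www.gaohbook.net/book/12345.html
#     YuBook().run("/some-book/")  # fetches https://m.yubook.la/some-book/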