Crawling.py
# -*- encoding: utf-8 -*-
import requests
import json
import re
from bs4 import BeautifulSoup

seed = "https://cloud.tencent.com/document/product/1709"
baseUrl = "https://cloud.tencent.com"
appendUrlList = []
appendDataList = []

# Collect the URL of every section listed in the documentation sidebar.
def getCrawl(seed):
    textdata = requests.get(seed).text
    soup = BeautifulSoup(textdata, 'lxml')
    # The sidebar navigation tree is embedded in the page as JSON
    # inside a <textarea class="J-qcSideNavListData"> element.
    nodes = soup.select("textarea.J-qcSideNavListData")
    jsonObj = json.loads(nodes[0].getText())["list"]
    getChild(jsonObj)

# Walk the navigation tree recursively, recording each node's title
# and absolute link; descend into "children" where present.
def getChild(nowObj):
    if nowObj is not None:
        for n in nowObj:
            link = baseUrl + n["link"]
            appendUrlList.append({"title": n["title"], "link": link})
            if n.get("children") is not None:
                getChild(n.get("children"))

# Fetch every collected page and extract the text of its markdown body.
def crawlData():
    getCrawl(seed)
    for item in appendUrlList:
        url = item["link"]
        print(item["title"] + " " + url)
        textdata = requests.get(url).text
        soup = BeautifulSoup(textdata, 'lxml')
        nodes = soup.select("div.J-markdown-box")
        if len(nodes) > 0:
            # Keep at most the first 6000 characters and collapse
            # runs of consecutive newlines into a single newline.
            text = nodes[0].get_text()[:6000]
            stringText = re.sub(r'\n+', '\n', text)
            appendDataList.append({"url": url, "title": item["title"], "text": stringText})
    return appendDataList
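
A minimal usage sketch follows, assuming the script is run as a module entry point and that the Tencent Cloud documentation pages still carry the J-qcSideNavListData / J-markdown-box markup the crawler targets. The output filename crawled_docs.json is hypothetical, chosen here only for illustration; the original script does not persist its results.

# Usage sketch (assumption: executed directly, not part of the original file).
if __name__ == "__main__":
    results = crawlData()
    print("crawled %d pages" % len(results))
    # Save the scraped sections for later use, e.g. indexing;
    # the filename below is illustrative, not from the original script.
    with open("crawled_docs.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)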