-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.py
74 lines (70 loc) · 2.76 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import requests
import time
from bs4 import BeautifulSoup as bs
import os
import re
import urllib.request
import json
from requests_html import HTMLSession
def dirCreator(src):
    """Make sure the directory ``src`` exists, creating it when it is missing.

    A short status message is printed either way.

    Args:
        src: Path of the directory that must exist. Missing parent
            directories are created as well.

    Raises:
        OSError: If the directory cannot be created (for example, a regular
            file already occupies the path).
    """
    if os.path.isdir(src):
        print("Dir existed.")
        return
    print("Dir not exist")
    print(f"Create a new Dir which name is {src} in current path.")
    # Act on the path that was passed in rather than on a module-level global,
    # so the function works for any caller-supplied directory.
    os.makedirs(src, exist_ok=True)
# Directory where the downloaded photos are stored.
saveDir = "./Beauty2/"
dirCreator(saveDir)

# Index pages of the PTT Beauty board to walk through, one per loop pass.
# Ranges used on earlier runs: 1500-2500 and 2500-3495.
beautyUrlCount = 3400
fileNameCount = 0  # running number that keeps every saved file name unique
timeCount = 0  # seconds to wait before the next attempt after a failure
retryCount = 0  # consecutive failures on the current index page
maxRetries = 3  # skip an index page after this many failed attempts

# One session serves every request and is closed when the crawl ends.
with HTMLSession() as session:
    while beautyUrlCount <= 3495:
        try:
            # Build the address from the counter so each pass reads a new page.
            beautyTitleUrl = "https://www.ptt.cc/bbs/Beauty/index" + str(beautyUrlCount) + ".html"
            r = session.get(beautyTitleUrl, cookies={'over18': '1'})
            getTitle = r.html.find(".title")
            print(f"Curren Beauty Url Count {beautyUrlCount}")
            for i in getTitle:
                if "正妹" not in i.text:
                    continue
                # NOTE(review): this stops scanning the remaining titles of the
                # page, not just this one -- confirm `break` (rather than
                # `continue`) is intended.
                if "肉特" in i.text:
                    break
                for article in i.find('a'):
                    articleList = list(article.absolute_links)
                    if not articleList:
                        continue
                    print(articleList[0])
                    # Fetch the article that was just printed and collect its links.
                    r1 = session.get(articleList[0], cookies={'over18': '1'})
                    getContent = r1.html.find("#main-content")
                    for c in getContent:
                        getImageUrl = c.find('a')
                        # Filter out the last URL in the article body.
                        imgCount, endCount = 0, len(getImageUrl) - 1
                        for image in getImageUrl:
                            if imgCount == endCount:
                                break
                            # Only even positions are examined, and a skipped
                            # (non-imgur) link does not advance the position.
                            # NOTE(review): presumably this drops a duplicate
                            # anchor per image -- confirm against the page markup.
                            if imgCount % 2 == 0:
                                imageUrlToList = list(image.absolute_links)
                                if not imageUrlToList or "imgur" not in imageUrlToList[0]:
                                    continue
                                imageUrl = imageUrlToList[0]
                                if "jpg" in imageUrl:
                                    response = session.get(imageUrl)
                                    # Do not store an error page under a .jpg name.
                                    if response.ok:
                                        with open(saveDir + "beauty" + str(fileNameCount) + ".jpg", "wb") as f:
                                            f.write(response.content)
                                        fileNameCount += 1
                            imgCount += 1
        except Exception as e:
            print(f"Get Error {e}, Current beautyUrlCount {beautyUrlCount}")
            timeCount += 5
            retryCount += 1
            if retryCount >= maxRetries:
                # A page that keeps failing must not stall the whole crawl.
                print(f"Skip beautyUrlCount {beautyUrlCount} after {retryCount} failed attempts")
                beautyUrlCount += 1
                retryCount = 0
            # Back off before the next request.
            time.sleep(timeCount)
        else:
            # Page handled cleanly: move on and clear the back-off state.
            beautyUrlCount += 1
            retryCount = 0
            timeCount = 0