main.py
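"""Bulk-download the wp-content/uploads folder of a WordPress site.

Requires the web server to expose directory listings (autoindex). The
script walks the year/month upload folders, skips resized thumbnails and
files already present locally, then downloads the rest in parallel.
"""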
import requests
import re
import os
import sys
from multiprocessing.pool import ThreadPool
# CONFIG
thumbnails_regex_list = [
    r'-\d{1,4}x\d{1,4}$',  # match: -421x421
    r'-e\d{13}$',          # match: -e1676678024195
    r'-scaled$',
    r'-modified',
]
# END CONFIG
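# Illustrative example (hypothetical filenames): if a listing contains both
# photo-421x421.jpg and photo.jpg, the first pattern matches "-421x421", so
# remove_thumbnails() drops the resized copy and keeps only photo.jpg.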

# Work relative to the script's own directory
if os.path.exists(os.path.abspath(sys.argv[0])):
    os.chdir(os.path.dirname(os.path.abspath(sys.argv[0])))
if len(sys.argv) < 2:
    print("usage: main.py [site]")
    print("e.g. python3 main.py wp-site.com")
    sys.exit()
site = sys.argv[1]
if site.split("://")[0] not in ["http", "https"]:
    site = "http://" + site
site = os.path.join(site, 'wp-content', 'uploads/')
content_list = []

def get_content_urls():
    global content_list
    print("\nSearching content to download...", flush=True)
    dates = []
    # Get directories by year
    response = requests.get(site)
    years = re.findall(r'alt="\[DIR\]"> <a href="(\d{2,4})\/"', response.content.decode())
    # Collect the directories to examine
    for year in years:
        response = requests.get(site + "/" + year)
        # Get directories by month
        months = re.findall(r'alt="\[DIR\]"> <a href="(\d{2})\/"', response.content.decode())
        for month in months:
            dates.append(year + '/' + month)
    # Get all urls to download
    for date in dates:
        print("Listing " + site + date, flush=True, end="")
        # Get files ordered by size desc
        response = requests.get(site + date + "?C=S;O=D")
        temp_files = []
        for file in re.findall(r'alt="\[(?!PARENTDIR).*\]"> <a href="(.+)"', response.content.decode()):
            temp_files.append(os.path.join(site, date, file))
        print(" - " + str(len(temp_files)) + " files found")
        content_list += temp_files
    with open(os.path.join(os.getcwd(), "last_fetched_urls.txt"), "w") as f:
        f.write(str(content_list).replace(",", ",\n"))
    print(' ===Total: ' + str(len(content_list)) + ' files found===\n')
    # Create directories
    create_dirs(dates)
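# Note: the listing regexes above assume Apache-style autoindex markup,
# e.g. rows like <img src="/icons/folder.gif" alt="[DIR]"> <a href="2023/">,
# and the "?C=S;O=D" query relies on Apache's sort-by-size-descending links.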

# Create directories
def create_dirs(dates=[]):
    for date in dates:
        dirname = os.path.join(os.getcwd(), site.replace("https://", "").replace("http://", "") + date)
        if not os.path.exists(dirname):
            print("New path created: " + dirname.replace(os.getcwd() + "/", ""))
            os.makedirs(dirname)

def remove_already_downloaded():
    global content_list
    print("Skipping already downloaded files...", flush=True)
    new_urls_list = []
    for url in content_list:
        file_path = url.replace("https://", "").replace("http://", "")
        if not os.path.exists(os.path.join(os.getcwd(), file_path)):
            new_urls_list.append(url)
    print("Files skipped: " + str(len(content_list) - len(new_urls_list)))
    print("===" + str(len(new_urls_list)) + " files to download===")
    content_list = new_urls_list[:]

def remove_thumbnails():
    global content_list
    if len(thumbnails_regex_list) == 0:
        return
    print("Skipping thumbnails...", flush=True)
    thumbnails_count = 0
    for regex in thumbnails_regex_list:
        for image in content_list[:]:
            sub_res = re.search(regex, image.split(".")[-2])
            # If it is a thumbnail, search for the original file
            if sub_res is not None:
                if image.replace(sub_res[0], "") in content_list:
                    # Remove only if there is an original file
                    content_list.remove(image)
                    thumbnails_count += 1
                    remove_local_file(image)
    print("Thumbnails found: " + str(thumbnails_count))
    print("===" + str(len(content_list)) + " files remaining===\n")

def remove_local_file(remote_url):
    file_path = os.path.join(os.getcwd(), remote_url.replace("https://", "").replace("http://", ""))
    if os.path.exists(file_path):
        os.remove(file_path)

# Download a single file from the uploads folder
def download(url):
    response = requests.get(url, stream=True)
    file_path = os.path.join(os.getcwd(), url.replace("https://", "").replace("http://", ""))
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            for data in response:
                file.write(data)
        return url + " - Success"
    else:
        return url + " - error: server returned " + str(response.status_code)

def start():
    # Run 5 worker threads; each call takes the next element in the urls list
    results = ThreadPool(5).imap_unordered(download, content_list)
    for r in results:
        print(r)

get_content_urls()
remove_thumbnails()
remove_already_downloaded()
if len(content_list) > 0:
    print("\nDownloading...")
    start()
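
# Example invocation (using the hypothetical domain from the usage message):
#   python3 main.py wp-site.com
# Files are mirrored locally under ./wp-site.com/wp-content/uploads/<year>/<month>/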