# -*- coding: utf-8 -*-
# AdList Sanitizer 0.1b #
# by M-A.Sc #
# Slow and dirty but multiplatform ;) #
# There are a bunch of Adlists floating around... With this script you can
# download multiple Adlists (lists.txt), filter and extract valid domain and subdomain entries,
# delete duplicates and specific domains, and make yourself a nice, customized and clean (sanitized :) AdList.
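# For reference, lists.txt is expected to contain one raw adlist URL per line,
# for example (hypothetical URLs, shown only to illustrate the format):
#   https://example.com/hosts.txt
#   https://example.org/adlist.txt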
import os
import requests
import re
from concurrent.futures import ThreadPoolExecutor
# parallel downloads
max_concurrent_downloads = 4
# AdList URLs
url_file = 'lists.txt'
# make dir if it doesn't exist
if not os.path.exists('download'):
    os.makedirs('download')
# download function
def download_file(url):
    filename = url.split('/')[-1]
    filepath = os.path.join('download', filename)
    # a timeout keeps one stalled server from hanging the whole run
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open(filepath, 'wb') as file:
            file.write(response.content)
        file_size = os.path.getsize(filepath)
        num_entries = len(response.content.decode(errors='ignore').splitlines())
        print(f'{filename} was downloaded. Size: {file_size} bytes, lines: {num_entries}')
    else:
        print(f'Error trying to download {filename}. Error code: {response.status_code}')
# multi download
def download_files(urls):
    with ThreadPoolExecutor(max_workers=max_concurrent_downloads) as executor:
        executor.map(download_file, urls)
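# Note: an exception raised inside download_file (e.g. requests.ConnectionError
# on an unreachable host) only surfaces when the results of executor.map are
# consumed; as written, a download that raises is silently skipped.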
# merge lists
def merge_files():
    files = os.listdir('download')
    merged_content = ''
    for file in files:
        filepath = os.path.join('download', file)
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            file_content = f.read()
        # the trailing newline keeps the last entry of one list from fusing
        # with the first entry of the next
        merged_content += file_content + '\n'
    with open('master.txt', 'w') as f:
        f.write(merged_content)
    print('Adlists were merged into master.txt')
# extract domain names
def extract_domain_names(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()
    # pattern: one or more dot-separated alphanumeric labels followed by an alphabetic TLD
    pattern = r'(?:[a-zA-Z0-9]+(?:-[a-zA-Z0-9]+)*\.)+[a-zA-Z]{2,}'
    # search for matching patterns
    matches = re.findall(pattern, content)
    # get rid of www.
    domain_names = [re.sub(r'^www\.', '', match) for match in matches]
    # return cleaned domain names
    return domain_names
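# A quick illustration on hypothetical input: the pattern pulls the domain out
# of a hosts-style line such as '0.0.0.0 ads.example.com' -> 'ads.example.com'
# (the IP is skipped because the final label must be alphabetic), and the
# substitution above turns 'www.tracker.example.net' into 'tracker.example.net'.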
# remove duplicates (sorted so sanity.txt comes out in a deterministic order)
def remove_duplicates(input_list):
    return sorted(set(input_list))
# delete specific domains (and their subdomains) from sanity.txt
def delete_domains(domains_to_delete):
    # load sanity.txt, skipping the header comment and blank lines
    # so only real entries are counted
    sanity_file_path = 'sanity.txt'
    with open(sanity_file_path, 'r') as sanity_file:
        domain_names = [line for line in sanity_file.read().splitlines()
                        if line and not line.startswith('#')]
    # domain name entries before deletion
    num_entries_before = len(domain_names)
    # drop exact matches and, as the summary message promises, their subdomains
    updated_domains = [domain for domain in domain_names
                       if domain not in domains_to_delete
                       and not any(domain.endswith('.' + d) for d in domains_to_delete)]
    # rewrite sanity.txt
    with open(sanity_file_path, 'w') as sanity_file:
        sanity_file.write('#### adlist sanitizer 0.1b ####\n\n')
        for domain_name in updated_domains:
            sanity_file.write(domain_name + '\n')
    # count the deleted entries
    num_entries_deleted = num_entries_before - len(updated_domains)
    print(f'These domains were deleted from sanity.txt: {", ".join(domains_to_delete)}')
    print(f'{num_entries_deleted} entries were deleted, subdomains included.')
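# Hypothetical example: deleting 'example.com' removes that exact entry and
# subdomain entries such as 'ads.example.com', but leaves 'notexample.com' alone.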
# delete temporary data
def delete_files():
    # delete download dir
    if os.path.exists('download'):
        for file in os.listdir('download'):
            file_path = os.path.join('download', file)
            os.remove(file_path)
        os.rmdir('download')
        print("Folder 'download' was deleted.")
    # delete temporary .txt files
    if os.path.exists('output.txt'):
        os.remove('output.txt')
        print("'output.txt' was deleted.")
    if os.path.exists('master.txt'):
        os.remove('master.txt')
        print("'master.txt' was deleted.")
# ask the user which cleanup steps to run
def prompt_user():
    while True:
        delete_option = input('Do you want to remove some domains (e.g. google.com) from sanity.txt? (y/n): ')
        if delete_option.lower() == 'y':
            domains_to_delete = input('Enter the domains you want to delete, separated by spaces: ')
            delete_domains(domains_to_delete.split())
        elif delete_option.lower() == 'n':
            break
        else:
            print('Invalid input, please enter "y" or "n".')
    delete_files_option = input('Do you want to delete temporary files and folders? (y/n): ')
    if delete_files_option.lower() == 'y':
        delete_files()
# statistics and output (relies on domain_names and unique_domain_names
# defined at module level in the main block below)
def display_entry_stats():
    num_entries_master = len(domain_names)
    num_entries_sanity = len(unique_domain_names)
    percentage_difference = (num_entries_sanity / num_entries_master) * 100 if num_entries_master else 0.0
    print(f"master.txt contains {num_entries_master} domains.")
    print(f"The sanitized sanity.txt contains {num_entries_sanity} domains.")
    print(f"The final list is just {percentage_difference:.2f}% the size of master.txt.")
# main
if os.path.exists(url_file):
    with open(url_file, 'r') as file:
        urls = [line.strip() for line in file.readlines() if line.strip()]
    download_files(urls)
    merge_files()
    # extract domains
    domain_names = extract_domain_names('master.txt')
    # put domain names into output.txt
    output_file_path = 'output.txt'
    with open(output_file_path, 'w') as output_file:
        for domain_name in domain_names:
            output_file.write(domain_name + '\n')
    # delete duplicates and save in sanity.txt
    unique_domain_names = remove_duplicates(domain_names)
    sanity_file_path = 'sanity.txt'
    with open(sanity_file_path, 'w') as sanity_file:
        sanity_file.write('#### adlist sanitizer 0.1b ####\n\n')
        for domain_name in unique_domain_names:
            sanity_file.write(domain_name + '\n')
    prompt_user()
    display_entry_stats()
else:
    print(f'Cannot find "{url_file}"')