# DelfiMarkdownScraper.py
import os
import sys
import re
import json
import subprocess
from datetime import datetime
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext, ttk

SETTINGS_FILE = 'scraper_settings.json'
untitled_article_count = 0
def load_settings():
    if os.path.exists(SETTINGS_FILE):
        with open(SETTINGS_FILE, 'r') as f:
            return json.load(f)
    return {}

def save_settings(settings):
    with open(SETTINGS_FILE, 'w') as f:
        json.dump(settings, f)
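
# An illustrative example of scraper_settings.json as written by save_settings().
# Only the keys actually passed in by the GUI callbacks below are stored; the
# paths here are placeholders, not real values:
#
#   {
#       "last_urls_file": "C:/path/to/urls.txt",
#       "last_save_dir": "C:/path/to/output"
#   }
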
# Function to append messages to the console and log file
def append_to_console(message, log_file, message_type='info', console_only=True):
    now = datetime.now().strftime("%H:%M:%S")
    formatted_message = f"[{now}] {message}\n"
    # Show in the console only if console_only is True
    if console_only:
        if message_type == 'success':
            console.tag_config('success', foreground='green')
            console.insert(tk.END, formatted_message, 'success')
        elif message_type == 'error':
            console.tag_config('error', foreground='red')
            console.insert(tk.END, formatted_message, 'error')
        else:
            console.insert(tk.END, formatted_message)
        console.yview(tk.END)  # Auto-scroll to the end
    # Always append to the log file
    with open(log_file, 'a', encoding='utf-8') as log:
        log.write(formatted_message)
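
# Every console/log line produced above follows the same "[HH:MM:SS] message"
# pattern, e.g. (placeholder domain):
#
#   [14:05:31] Processing: www.example.com
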
def get_domain(url):
    return urlparse(url).netloc
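
# Example (placeholder URL): get_domain('https://www.example.com/news/article-123')
# returns 'www.example.com', which is the key used to look up a site config below.
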
# Function to scrape a single URL and save its content
def scrape_article(url, tag, class_name, title_tag, title_class, subtitle_tag, subtitle_class, save_dir, log_file):
    global untitled_article_count
    domain = get_domain(url)  # Resolved before the try block so it is available in error messages
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for request errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the article title
        if title_tag and title_class:
            title_element = soup.find(title_tag, class_=title_class)
        else:
            title_element = soup.find('h1')  # Default to the first h1 if not specified
        if title_element and title_element.text.strip():
            article_title = title_element.text.strip()
        else:
            untitled_article_count += 1
            article_title = f"Untitled Article {untitled_article_count}"

        # Extract the subtitle (if configured)
        subtitle = ""
        if subtitle_tag and subtitle_class:
            subtitle_element = soup.find(subtitle_tag, class_=subtitle_class)
            if subtitle_element:
                subtitle = subtitle_element.text.strip()

        # Extract the article content using the configured tag and class_name
        if tag and class_name:
            article_content = soup.find(tag, class_=class_name)
        else:
            article_content = soup.find('div', class_='article-content')  # Default if not specified
        if article_content:
            html_content = str(article_content)
        else:
            html_content = '<p>Article content not found.</p>'

        # Convert HTML to Markdown using markdownify
        markdown_content = md(html_content, heading_style="ATX")

        # Create a valid filename from the article title
        valid_filename = re.sub(r'[^\w\-_\. ]', '_', article_title)
        valid_filename = valid_filename.replace(' ', '_')
        file_path = os.path.join(save_dir, f'{valid_filename}.md')

        # Ensure a unique filename
        counter = 1
        while os.path.exists(file_path):
            file_path = os.path.join(save_dir, f'{valid_filename}_{counter}.md')
            counter += 1

        # Save the article content to a Markdown file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(f"# {article_title}\n\n")
            if subtitle:
                file.write(f"## {subtitle}\n\n")
            file.write(markdown_content)

        # Short confirmation in the console; full details go to the log file only
        append_to_console(f"Article '{article_title}' has been saved", log_file, 'success', console_only=True)
        append_to_console(f"Article '{article_title}' from {url} has been saved to {file_path}", log_file, 'success', console_only=False)
    except Exception as e:
        append_to_console(f"Failed to scrape article from {domain}", log_file, 'error', console_only=True)
        append_to_console(f"Failed to scrape {url}: {e}", log_file, 'error', console_only=False)
# Load the configuration file based on the domain
def load_config_for_domain(domain):
    # Remove 'www.' if present and replace '.' with '_'
    config_filename = domain.replace('www.', '').replace('.', '_') + '.json'
    config_path = f"./configs/{config_filename}"
    if os.path.exists(config_path):
        with open(config_path, 'r') as file:
            return json.load(file)
    else:
        return None
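
# A hypothetical example of a per-site config, using the keys read in
# process_urls() below. The selector values depend entirely on the target
# site's markup and are placeholders here. For https://www.example.com the
# file would be ./configs/example_com.json:
#
#   {
#       "content_tag": "div",
#       "content_class": "article-body",
#       "title_tag": "h1",
#       "title_class": "article-title",
#       "subtitle_tag": "p",
#       "subtitle_class": "article-lead"
#   }
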
# Main function to process URLs from a file with site configurations
def process_urls(urls_file_path, save_dir):
    global untitled_article_count
    untitled_article_count = 0  # Reset the counter at the start of each scraping session
    log_file = os.path.join(save_dir, 'log.txt')  # Log file path
    try:
        with open(urls_file_path, 'r') as file:
            urls = file.readlines()
        for url in urls:
            url = url.strip()
            if url:
                domain = get_domain(url)
                append_to_console(f"Processing: {domain}", log_file, console_only=True)
                append_to_console(f"Processing URL: {url}", log_file, console_only=False)
                append_to_console(f"Extracted domain: {domain}", log_file)
                config = load_config_for_domain(domain)
                if config:
                    append_to_console(f"Config loaded for {domain}", log_file, 'success')
                    tag = config.get("content_tag")
                    class_name = config.get("content_class")
                    title_tag = config.get("title_tag")
                    title_class = config.get("title_class")
                    subtitle_tag = config.get("subtitle_tag")
                    subtitle_class = config.get("subtitle_class")
                    scrape_article(url, tag, class_name, title_tag, title_class, subtitle_tag, subtitle_class, save_dir, log_file)
                else:
                    append_to_console(f"No configuration found for: {domain}. Article not saved.", log_file, 'error', console_only=False)
                    append_to_console(f"No configuration file found for domain: {domain}. Article not saved.", log_file, 'error')
            else:
                append_to_console("Empty URL encountered", log_file)
        append_to_console("Processing completed. Check the log for details.", log_file, 'success')
    except Exception as e:
        append_to_console(f"An error occurred: {e}", log_file, 'error')
# GUI callbacks for selecting files and directories
def browse_file(entry):
    file_path = filedialog.askopenfilename()
    if file_path:
        entry.delete(0, tk.END)
        entry.insert(0, file_path)
        save_settings({'last_urls_file': file_path, 'last_save_dir': save_entry.get()})

def browse_directory(entry):
    dir_path = filedialog.askdirectory()
    if dir_path:
        entry.delete(0, tk.END)
        entry.insert(0, dir_path)
        # The config dropdown was replaced by an info label, so only the file
        # and directory paths are persisted here.
        save_settings({'last_urls_file': urls_entry.get(), 'last_save_dir': dir_path})

# Function to load config files from the ./configs directory
def load_configs():
    config_dir = './configs'
    if not os.path.exists(config_dir):
        os.makedirs(config_dir)
    config_files = [f for f in os.listdir(config_dir) if f.endswith('.json')]
    configs = {}
    for f in config_files:
        # Use a context manager so file handles are closed after reading
        with open(os.path.join(config_dir, f), 'r') as config_file:
            configs[f] = json.load(config_file)
    return configs

# Start the scraping run using the values from the GUI fields
def start_scraping():
    urls_file_path = urls_entry.get()
    save_dir = save_entry.get()
    if not urls_file_path or not save_dir:
        messagebox.showwarning("Input Error", "Please fill all fields.")
        return
    save_settings({'last_urls_file': urls_file_path, 'last_save_dir': save_dir})
    log_file = os.path.join(save_dir, 'log.txt')
    append_to_console("Starting the scraping process...", log_file)
    process_urls(urls_file_path, save_dir)

def clear_console():
    console.delete(1.0, tk.END)
    # log_file is not defined at module level, so rebuild the path from the
    # current save directory (falling back to the working directory).
    save_dir = save_entry.get()
    log_file = os.path.join(save_dir if os.path.isdir(save_dir) else '.', 'log.txt')
    append_to_console("Console cleared.", log_file, 'info', console_only=True)

def open_save_location():
    save_dir = save_entry.get()
    # Log to the save directory if it exists, otherwise to the working directory
    log_file = os.path.join(save_dir if os.path.isdir(save_dir) else '.', 'log.txt')
    if os.path.exists(save_dir):
        if os.name == 'nt':  # Windows
            os.startfile(save_dir)
        elif sys.platform == 'darwin':  # macOS
            subprocess.call(('open', save_dir))
        elif os.name == 'posix':  # Linux and other Unix-like systems
            subprocess.call(('xdg-open', save_dir))
        else:
            append_to_console("Unsupported operating system for opening folders.", log_file, 'error')
    else:
        append_to_console("Save directory does not exist.", log_file, 'error')

# Create the main window
root = tk.Tk()
root.title("Delfi Markdown Scraper")
root.geometry("600x550")  # Tall enough for the input fields, buttons and console
# Create a main frame
main_frame = ttk.Frame(root, padding="10")
main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
root.columnconfigure(0, weight=1)
root.rowconfigure(0, weight=1)
# Load saved settings
settings = load_settings()
# URL text file
ttk.Label(main_frame, text="URLs File:").grid(row=0, column=0, padx=5, pady=5, sticky='e')
urls_entry = ttk.Entry(main_frame, width=50)
urls_entry.grid(row=0, column=1, padx=5, pady=5, sticky=(tk.W, tk.E))
urls_entry.insert(0, settings.get('last_urls_file', ''))
ttk.Button(main_frame, text="Browse", command=lambda: browse_file(urls_entry)).grid(row=0, column=2, padx=5, pady=5)
# Config info label showing how many site configs were found
ttk.Label(main_frame, text="Configurations:").grid(row=1, column=0, padx=5, pady=5, sticky='e')
configs = load_configs()
config_count = len(configs)
config_info = f"{config_count} configurations loaded dynamically."
ttk.Label(main_frame, text=config_info).grid(row=1, column=1, padx=5, pady=5, sticky=(tk.W, tk.E))
# Save directory
ttk.Label(main_frame, text="Save Directory:").grid(row=2, column=0, padx=5, pady=5, sticky='e')
save_entry = ttk.Entry(main_frame, width=50)
save_entry.grid(row=2, column=1, padx=5, pady=5, sticky=(tk.W, tk.E))
save_entry.insert(0, settings.get('last_save_dir', ''))
ttk.Button(main_frame, text="Browse", command=lambda: browse_directory(save_entry)).grid(row=2, column=2, padx=5, pady=5)
# Start Button
ttk.Button(main_frame, text="Start Scraping", command=start_scraping).grid(row=3, column=0, columnspan=3, pady=10)
# Clear Console and Open Save Location buttons
button_frame = ttk.Frame(main_frame)
button_frame.grid(row=4, column=0, columnspan=3, pady=10)
ttk.Button(button_frame, text="Clear Console", command=clear_console).grid(row=0, column=0, padx=5)
ttk.Button(button_frame, text="Open Save Location", command=open_save_location).grid(row=0, column=1, padx=5)
# Console output
ttk.Label(main_frame, text="Console Output:").grid(row=5, column=0, columnspan=3, padx=5, pady=5)
console = scrolledtext.ScrolledText(main_frame, width=70, height=15, wrap=tk.WORD)
console.grid(row=6, column=0, columnspan=3, padx=5, pady=5, sticky=(tk.W, tk.E, tk.N, tk.S))
console.config(state=tk.NORMAL)
console.tag_config('success', foreground='green')
console.tag_config('error', foreground='red')
# Configure row and column weights so the layout expands with the window
for i in range(7):
    main_frame.rowconfigure(i, weight=1)
for i in range(3):
    main_frame.columnconfigure(i, weight=1)
# Run the application
root.mainloop()
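
# Usage sketch: the script imports requests, BeautifulSoup and markdownify, so
# the third-party packages requests, beautifulsoup4 and markdownify must be
# installed (e.g. `pip install requests beautifulsoup4 markdownify`). Running
# `python DelfiMarkdownScraper.py` opens the GUI defined above.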