-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraping_PDFs.py
59 lines (47 loc) · 1.7 KB
/
scraping_PDFs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""
Scrape PDF links from a website and download each file into a local directory.
"""
import os
from requests_html import HTMLSession
from pathlib import Path
from PyPDF2 import PdfFileReader
import hashlib
# Step 1: Specify the URL of the website to scrape.
# The URL must include its scheme ("https://" or "http://"): it is passed
# straight to session.get(), which rejects a bare host name such as
# "www.urlhere.com" before any request is made.
website_url = 'https://www.urlhere.com'  # Replace with the target URL

# Create a directory to save PDFs. The path is relative to the current working
# directory; exist_ok=True lets the script be re-run without failing when the
# directory is already there.
output_dir = Path("downloaded_pdfs")
output_dir.mkdir(exist_ok=True)
# Step 2: Scrape the website to find all PDF links
def get_pdf_links(url):
    """Return the absolute link targets found in the page's download module.

    The page is fetched and rendered so that links produced by JavaScript are
    present, then every ``<a>`` element matched by ``div.download-module a``
    contributes one absolute URL. Links are not checked for a ``.pdf`` suffix;
    the CSS selector alone decides what counts as a PDF link.

    Args:
        url: Address of the page to scan, including its scheme.

    Returns:
        A list of absolute URLs in the order they were found, with repeated
        URLs dropped so the same file is not downloaded twice.
    """
    # Use the session as a context manager so it is closed (together with any
    # browser started by render()) even when fetching or rendering raises.
    with HTMLSession() as session:
        response = session.get(url)
        response.html.render(sleep=1)  # Wait for JavaScript to execute

        # Collect all possible links to PDFs
        found = []
        for anchor in response.html.find('div.download-module a'):
            targets = anchor.absolute_links  # read once, reuse below
            if targets:
                # An anchor normally yields a single link; take one of them.
                found.append(targets.pop())

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(found))
# Step 3: Download PDFs
def download_pdf(pdf_url):
    """Download one file into ``output_dir`` and report progress on stdout.

    The file name is the text after the last ``/`` of the URL; when that is
    empty (the URL ends with ``/``) a name is derived from the MD5 digest of
    the URL. An existing file with the same name is overwritten.

    Any failure while fetching or writing, including an HTTP error status,
    is printed and swallowed so that one bad link does not abort a batch of
    downloads.

    Args:
        pdf_url: Absolute URL of the file to fetch.

    Returns:
        None.
    """
    # Extract the PDF name from the URL
    pdf_name = pdf_url.split("/")[-1]

    # If the PDF name is empty, generate a name from the URL.
    # MD5 is used here only to build a stable file name, not for security.
    if not pdf_name:
        pdf_name = 'file_' + hashlib.md5(pdf_url.encode()).hexdigest() + '.pdf'

    pdf_path = output_dir / pdf_name

    print(f"Downloading {pdf_name}...")

    try:
        # Download the PDF; the context manager closes the session afterwards.
        with HTMLSession() as session:
            response = session.get(pdf_url)
            # Without this check an HTTP error page (404, 500, ...) would be
            # written to disk and reported as a successful download.
            response.raise_for_status()
            payload = response.content

        pdf_path.write_bytes(payload)
        print(f"Saved {pdf_name} to {pdf_path}")
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # batch, so the error is reported and the caller moves on.
        print(f"Failed to download {pdf_url}, Error: {str(e)}")
# Step 4: Execute the script
def main():
    """Find the PDF links on ``website_url`` and download each one in turn."""
    # Get all PDF links from the website
    links = get_pdf_links(website_url)
    print(f"Found {len(links)} PDF links.")

    # Download each PDF
    for link in links:
        download_pdf(link)


if __name__ == "__main__":
    main()