#!/usr/bin/python
# extractpdf2.py -- from a fork of FooSoft/mangle.
import sys
import os
import time
def create_extract_directory(file_path):
    """Create and return the ``<name>_extract`` directory beside *file_path*.

    The directory is named after the file's base name with its last
    extension removed and ``_extract`` appended, and it lives in the same
    directory as the file itself. An already existing directory is reused.
    """
    parent, leaf = os.path.split(file_path)
    stem, _ = os.path.splitext(leaf)
    target = os.path.join(parent, stem + "_extract")
    os.makedirs(target, exist_ok=True)
    return target
import re
import fitz # PyMuPDF
import io
from PIL import Image
def extract_jpg_from_pdf(pdf_path, output_folder):
    """Save every JPEG image embedded in a PDF into *output_folder*.

    Each page is scanned for embedded images. Images whose extracted
    format is JPEG are written as ``page_<P>_img_<I>.jpg`` (both numbers
    are 1-based); images in any other format are reported but not saved.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the image files.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            image_list = page.get_images(full=True)
            print(f"[INFO] Found {len(image_list)} images on page {page_number}")

            for img_number, img in enumerate(image_list, start=1):
                # The first field of each image entry is its xref number.
                base_image = pdf_document.extract_image(img[0])
                image_ext = base_image["ext"]

                if image_ext != "jpeg":
                    # Skipped images are never decoded, so a format that
                    # Pillow cannot read no longer aborts the whole run.
                    print(
                        f"[INFO] Extracted image {img_number} "
                        f"with extension: {image_ext}"
                    )
                    continue

                image_name = f"page_{page_number}_img_{img_number}.jpg"
                image_path = os.path.join(output_folder, image_name)
                # NOTE(review): saving through Pillow re-encodes the JPEG;
                # writing base_image["image"] directly would be lossless.
                # Kept as-is so the produced files do not change.
                with Image.open(io.BytesIO(base_image["image"])) as image:
                    print(
                        f"[INFO] Extracted image {img_number} "
                        f"with extension: {image_ext}"
                    )
                    image.save(image_path)
                print(f"[INFO] Saved image: {image_path}")
def main():
    """Extract the JPEG images of the PDF named on the command line.

    Expects the PDF path as the first command-line argument. Exits with a
    message on standard error and a non-zero status when the argument is
    missing or does not name an existing file; exits with status 0 once
    the extraction has finished.
    """
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "extractpdf2.py"
        # sys.exit() with a string prints it to stderr and exits non-zero.
        sys.exit(f"Usage: {prog} <file.pdf>")

    filename = sys.argv[1]
    # Check before creating the output directory so that a mistyped path
    # does not leave an empty "<name>_extract" directory behind.
    if not os.path.isfile(filename):
        sys.exit(f"[ERROR] Not a file: {filename}")

    extract_dir = create_extract_directory(filename)
    extract_jpg_from_pdf(filename, extract_dir)
    sys.exit(0)
# Run the extraction only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()