-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract-pdf-images.py
59 lines (49 loc) · 1.91 KB
/
extract-pdf-images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Extract the highest resolution image from the PDF file, instead of using the lower resolution images in the Document.zip file
import argparse
from glob import glob
import os
import shutil
from typing import Any
import zipfile
from os.path import exists
from tqdm import tqdm
import logging
from utilities import setup_parser, setup_logging
import fitz
def extract_images(pdf_file, image_path):
    """Write the single embedded image of every page of a PDF to disk.

    Each page's image is saved in the format reported by the extraction
    call, as ``FullPg<NN>.<ext>`` inside *image_path*, where ``NN`` is the
    1-based page number zero-padded to two digits.

    Args:
        pdf_file: Path of the PDF document to read.
        image_path: Directory that receives the image files. It is not
            created here, so it must already exist.

    Raises:
        ValueError: If any page does not contain exactly one image.
    """
    # Open the document as a context manager so it is closed even when a
    # page fails validation or writing an image raises.
    with fitz.open(pdf_file) as pdf:
        for page_number, pdf_page in enumerate(pdf, start=1):
            images = pdf_page.get_images()
            if len(images) != 1:
                raise ValueError("Expected exactly one image per page")

            # The first item of an image entry is the reference that
            # extract_image() takes to fetch the embedded image.
            image_ref = images[0][0]
            img = pdf.extract_image(image_ref)

            img_file = os.path.join(
                image_path, f'FullPg{page_number:02}.{img["ext"]}'
            )
            with open(img_file, "wb") as fout:
                fout.write(img['image'])
def main():
    """Extract page images from every PDF found below ``args.base``.

    Command-line arguments come from the parser built by ``setup_parser``;
    this function reads ``args.base`` (root folder searched recursively for
    ``*.pdf`` files) and ``args.overwrite``.

    For each PDF the images are written to a ``pdf-images`` folder next to
    the PDF. If that folder already exists it is skipped, unless
    ``args.overwrite`` is set, in which case the folder is removed and
    re-created. A summary of extracted and skipped counts is printed at
    the end.
    """
    parser = setup_parser()
    args = parser.parse_args()
    setup_logging(args)

    logging.info(f'Extracting all images from PDFs in {args.base}')
    # This message needs the f-prefix; without it the literal text
    # "{args.base}" is printed instead of the folder name.
    print(f'Extracting all images from PDFs in {args.base}')

    pdf_files = glob(os.path.join(args.base, '**', '*.pdf'), recursive=True)
    skipped = extracted = 0
    for pdf_file in tqdm(pdf_files):
        # NOTE(review): the output folder depends only on the PDF's parent
        # directory, so several PDFs in one directory share (and will skip
        # or wipe) the same 'pdf-images' folder -- confirm one PDF per
        # directory is guaranteed.
        folder_path = os.path.join(os.path.dirname(pdf_file), 'pdf-images')

        if os.path.isdir(folder_path):
            if not args.overwrite:
                logging.info(f"Skipping {folder_path}, it already exists")
                skipped += 1
                continue
            # Overwrite requested: start from an empty folder so stale
            # images from a previous run cannot survive.
            logging.debug(f"Removing existing folder {folder_path}")
            shutil.rmtree(folder_path)

        logging.info(f"Extracting images from {pdf_file} to {folder_path}")
        os.makedirs(folder_path, exist_ok=True)
        extract_images(pdf_file, folder_path)
        extracted += 1

    print(f'Extracted images from {extracted} PDF files, skipped {skipped} files')
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()