extract offending pages when PyPDF throws an error (also a small PDF …

…page extraction script) (#6) * PdfFile.extract_page_range() * refactor overwrite confirmations * adjust to single page if 0 requested * Better parser for PDF page extract * Looking good * Page ranges work on extracting text * Make it work * attention_getting_panel() * .gitignore * ICP rules * README * style * Check dir exists before create * Update help screenshots * fix image in readme * comment * cruft --------- Co-authored-by: ashariyar <[email protected]>
michelcrypt4d4mus · Jul 30, 2023 · 0993917 · 0993917
1 parent fb14f89
commit 0993917
Show file tree

Hide file tree

Showing 23 changed files with 395 additions and 118 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,7 +14,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
 lib64/
 parts/
 sdist/
@@ -132,3 +131,4 @@ dmypy.json
 tests/tmp/
 .clown_sort
 custom_sort_rules.csv
+PDF Errors/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,13 @@
 # NEXT RELEASE
+* New script `extract_pages_from_pdf` lets you easily rip pages out of a PDF
+* Add `--page-range` argument to both `extract_pages_from_pdf` and `extract_text_from_files`
+* `pypdf` exceptions now cause the offending page to be extracted to its own PDF, and the user is encouraged to submit that page to the `pypdf` team
+* Bump `pypdf` to version 3.14.0 (fixes for many bugs on edge case PDFs)
 * Better handling of sort rules that fail to parse
-* Suppress `/JBIG2Decode` warning output
 * New crypto sort rules
+* Rename `--print-when-parsed` command line option to `--print-as-parsed`
+* Suppress `/JBIG2Decode` warning output when decoding PDFs
+* Refactor overwrite confirmation, use stderr
 
 ### 1.9.2
 * Allow `Pillow` 10.0.0

diff --git a/README.md b/README.md
@@ -136,7 +136,10 @@ pipx install clown_sort[gui]
 
 
 ## One Off Extractions
-`extract_text_from_files` (installed at the same time) is a convenient script you can use to extract text from a single file, multiple files, or all the files in a given directory using Google's best in class Tesseract library.
+There are several utilities / convenience scripts / whatever you want to call them that are installed along with `clown_sort`. They work with the same toolset as `sort_screenshots` but use it to simplify the extraction of text from images (or PDFs that are actually just a set of page-sized images).
+
+#### Extracting text
+`extract_text_from_files` is a convenient script you can use to extract text from a single file, multiple files, or all the files in a given directory using Google's best-in-class Tesseract library.
 
 ![](doc/extract_text_from_files_help.png)
 
@@ -148,6 +151,10 @@ extract_text_from_files MY_FILE1 MY_FILE2 SOME_DIR3
 
 This will parse and display the text in `MY_FILE1`, `MY_FILE2`, and all the files in `SOME_DIR3`.
 
+#### Extracting pages of a PDF to a new PDF
+`extract_pages_from_pdf` is a small script that can extract page ranges (e.g. "10-25") from PDFs on the command line.
+![](doc/extract_pages_from_pdf_help.png)
+
 
 # Contributing
 Feel free to file issues or open pull requests.

diff --git a/clown_sort/__init__.py b/clown_sort/__init__.py
@@ -1,4 +1,6 @@
-import sys
+"""
+Entry point for all of the clown_sort scripts.
+"""
 from argparse import Namespace
 from glob import glob
 from os import environ, getcwd, path
@@ -14,7 +16,7 @@
             load_dotenv(dotenv_path=dotenv_file)
             break
 
-from clown_sort.util.argument_parser import extraction_parser
+from clown_sort.util.argument_parser import parse_text_extraction_args, parse_pdf_page_extraction_args
 from clown_sort.config import Config
 from clown_sort.files.image_file import ImageFile
 from clown_sort.files.pdf_file import PdfFile
@@ -48,31 +50,30 @@ def extract_text_from_files() -> None:
     Extract text from a single file or from all files in a given directory. Can accept
     multiple paths as arguments on the command line.
     """
-    args: Namespace = extraction_parser.parse_args()
+    args: Namespace = parse_text_extraction_args()
     console.line()
-    files_to_process = []
 
     if args.debug:
         Config.enable_debug_mode()
-    if args.print_when_parsed:
-        Config.print_when_parsed = True
+    if args.print_as_parsed:
+        Config.print_as_parsed = True
 
-    for file_or_dir in args.file_or_dir:
-        file_path = Path(file_or_dir)
+    for file_path in args.files_to_process:
+        sortable_file = build_sortable_file(file_path)
 
-        if not file_path.exists():
-            console.print(f"File '{file_path}' doesn't exist!")
-            sys.exit(-1)
-        elif file_path.is_dir():
-            files_to_process.extend(files_in_dir(file_path))
+        if isinstance(sortable_file, PdfFile):
+            sortable_file.print_extracted_text(page_range=args.page_range)
         else:
-            files_to_process.append(file_path)
+            sortable_file.print_extracted_text()
 
-    for file_path in files_to_process:
-        build_sortable_file(file_path).print_extracted_text()
         console.line(2)
 
 
+def extract_pages_from_pdf() -> None:
+    """Script entry point: parse the command line and extract the requested page range to a new PDF."""
+    cli_args = parse_pdf_page_extraction_args()
+    source_pdf = PdfFile(cli_args.pdf_file)
+    source_pdf.extract_page_range(cli_args.page_range, destination_dir=cli_args.destination_dir)
+
+
 def set_screenshot_timestamps_from_filenames():
     """Parse the filenames to reset the file creation timestamps."""
     Config.configure()

diff --git a/clown_sort/config.py b/clown_sort/config.py
@@ -4,7 +4,6 @@
 import re
 import sys
 from argparse import Namespace
-from collections import namedtuple
 from importlib.metadata import version
 from os import environ
 from pathlib import Path
@@ -19,7 +18,7 @@
 from clown_sort.util import rich_helper
 from clown_sort.util.argument_parser import parser
 from clown_sort.util.constants import PACKAGE_NAME
-from clown_sort.util.filesystem_helper import subdirs_of_dir
+from clown_sort.util.filesystem_helper import create_dir_if_it_does_not_exist, subdirs_of_dir
 from clown_sort.util.logging import log, set_log_level
 
 
@@ -34,7 +33,7 @@ class Config:
     delete_originals: bool = False
     rescan_sorted: bool = False
     yes_overwrite: bool = False
-    print_when_parsed: bool = False
+    print_as_parsed: bool = False
     sort_rules: List[SortRule] = []
     filename_regex: re.Pattern
 
@@ -119,11 +118,10 @@ def set_directories(
         cls.destination_dir: Path = Path(destination_dir or screenshots_dir)
         cls.sorted_screenshots_dir = cls.destination_dir.joinpath('Sorted')
         cls.processed_screenshots_dir = cls.destination_dir.joinpath('Processed')
+        cls.pdf_errors_dir = cls.destination_dir.joinpath('PDF Errors')
 
         for dir in [cls.destination_dir, cls.sorted_screenshots_dir, cls.processed_screenshots_dir]:
-            if not dir.is_dir():
-                log.warning(f"Need to create '{dir}'")
-                dir.mkdir(parents=True, exist_ok=True)
+            create_dir_if_it_does_not_exist(dir)
 
         cls._log_configured_paths()
 
@@ -163,6 +161,7 @@ def _log_configured_paths(cls) -> None:
         log.debug(f"destination_dir: {cls.destination_dir}")
         log.debug(f"sorted_screenshots_dir: {cls.sorted_screenshots_dir}")
         log.debug(f"processed_screenshots_dir: {cls.processed_screenshots_dir}")
+        log.debug(f"pdf_errors_dir: {cls.pdf_errors_dir}")
 
 
 def _check_for_pysimplegui():
@@ -177,7 +176,7 @@ def _check_for_pysimplegui():
         )
 
         log_optional_module_warning('gui', msg)
-        console = Console()
+        console = Console(color_system='256')
         #console.line()
         console.print(f"You make also need to install 'python-tk'. In macOS this can be installed with 'brew install python-tk'.")
         sys.exit()
@@ -201,7 +200,7 @@ def log_optional_module_warning(module_name: str, msg: Optional[Text] = None) ->
             style='bright_white'
         )
 
-    console = Console()
+    console = Console(color_system='256')
     console.line()
     console.print(msg)
     console.line()

diff --git a/clown_sort/filename_extractor.py b/clown_sort/filename_extractor.py
@@ -5,11 +5,8 @@
 from difflib import SequenceMatcher
 from typing import Optional
 
-from rich.text import Text
-
 from clown_sort.util.logging import log
-from clown_sort.util.rich_helper import console
-from clown_sort.util.string_helper import strip_bad_chars, strip_mac_screenshot
+from clown_sort.util.filesystem_helper import strip_bad_chars, strip_mac_screenshot
 
 TWEET_REPLY_REGEX = re.compile(
     'Replying to (@\\w{3,15}).*?\\n(?P<body>.*)',

diff --git a/clown_sort/files/pdf_file.py b/clown_sort/files/pdf_file.py
@@ -2,35 +2,42 @@
 Wrapper for PDF files.
 """
 import io
-import logging
+import os
+from pathlib import Path
 from typing import Optional
 
 from PIL import Image
-from pypdf import PdfReader
+from pypdf import PdfReader, PdfWriter
 from pypdf.errors import DependencyError, EmptyFileError
 from rich.console import Console
 from rich.panel import Panel
+from rich.text import Text
 
 from clown_sort.config import Config, check_for_pymupdf, log_optional_module_warning
 from clown_sort.files.image_file import ImageFile
 from clown_sort.files.sortable_file import SortableFile
-from clown_sort.util.constants import MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR
+from clown_sort.lib.page_range import PageRange
+from clown_sort.util.constants import MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR, PDF_ERRORS
+from clown_sort.util.filesystem_helper import create_dir_if_it_does_not_exist, insert_suffix_before_extension
 from clown_sort.util.logging import log
-from clown_sort.util.rich_helper import WARNING, console, stderr_console
+from clown_sort.util.rich_helper import WARNING, attention_getting_panel, console, mild_warning, stderr_console
+from clown_sort.util.string_helper import exception_str
 
+DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath(PDF_ERRORS)
 MAX_DISPLAY_HEIGHT = 600
 SCALE_FACTOR = 0.4
 
 
 class PdfFile(SortableFile):
     is_presentable_in_popup = None
 
-    def extracted_text(self) -> Optional[str]:
+    def extracted_text(self, page_range: Optional[PageRange] = None) -> Optional[str]:
         """Use Tesseract to OCR the text in the image, which is returned as a string."""
         if self.text_extraction_attempted:
             return self._extracted_text
 
         log.debug(f"Extracting text from '{self.file_path}'...")
+        self.page_numbers_of_errors = []
         extracted_pages = []
 
         try:
@@ -39,6 +46,10 @@ def extracted_text(self) -> Optional[str]:
             log.debug(f"PDF Page count: {page_count}")
 
             for page_number, page in enumerate(pdf_reader.pages, start=1):
+                if page_range and not page_range.in_range(page_number):
+                    self._log_to_stderr(f"Skipping page {page_number}...")
+                    continue
+
                 self._log_to_stderr(f"Parsing page {page_number}...")
                 page_buffer = Console(file=io.StringIO())
                 page_buffer.print(Panel(f"PAGE {page_number}", padding=(0, 15), expand=False))
@@ -54,20 +65,24 @@ def extracted_text(self) -> Optional[str]:
                         image_obj = Image.open(io.BytesIO(image.data))
                         image_text = ImageFile.extract_text(image_obj, f"{self.file_path} ({image_name})")
                         page_buffer.print((image_text or '').strip())
-                except NotImplementedError as e:
-                    stderr_console.print(f"WARNING: {type(e).__name__}: {e} while parsing embedded image {image_number} on page {page_number}...")
+                except (OSError, NotImplementedError, TypeError, ValueError) as e:
+                    error_str = exception_str(e)
+                    msg = f"{error_str} while parsing embedded image {image_number} on page {page_number}..."
+                    mild_warning(msg)
 
+                    # Dump an error PDF and encourage user to report to pypdf team.
                     if 'JBIG2Decode' not in str(e):
                         stderr_console.print_exception()
-                except (OSError, TypeError, ValueError) as e:
-                    stderr_console.print(f"WARNING: {type(e).__name__}: {e} while parsing embedded image {image_number} on page {page_number}...")
-                    stderr_console.print_exception()
+
+                        if page_number not in self.page_numbers_of_errors:
+                            self._handle_extraction_error(page_number, error_str)
+                            self.page_numbers_of_errors.append(page_number)
 
                 page_text = page_buffer.file.getvalue()
                 extracted_pages.append(page_text)
                 log.debug(page_text)
 
-                if Config.print_when_parsed:
+                if Config.print_as_parsed:
                     print(f"{page_text}")
         except DependencyError:
             log_optional_module_warning('pdf')
@@ -104,6 +119,40 @@ def thumbnail_bytes(self) -> Optional[bytes]:
 
         return page.get_pixmap(matrix=zoom_matrix, clip= clip, alpha=False).tobytes()
 
+    def extract_page_range(
+            self,
+            page_range: PageRange,
+            destination_dir: Optional[Path] = None,
+            extra_file_suffix: Optional[str] = None
+        ) -> Path:
+        """
+        Extract a range of pages to a new PDF file.
+
+        The new file is named after this PDF with 'page_range.file_suffix()' (plus
+        'extra_file_suffix', if provided) inserted before the extension.
+
+        Args:
+            page_range: The pages to copy into the new PDF.
+            destination_dir: Directory to write the new PDF to (created if missing).
+                Defaults to DEFAULT_PDF_ERRORS_DIR.
+            extra_file_suffix: Optional free text appended to the file name suffix.
+                Characters that are unsafe in file names are replaced with '_'.
+
+        Returns:
+            Path of the new PDF. The path is returned even when
+            SortableFile.confirm_file_overwrite() is falsy and nothing was written.
+        """
+        import re  # Local import: only needed to sanitize the file name suffix.
+
+        # Accept plain strings (e.g. straight from argparse) as well as Path objects.
+        destination_dir = Path(destination_dir or DEFAULT_PDF_ERRORS_DIR)
+        create_dir_if_it_does_not_exist(destination_dir)
+        page_range_suffix = page_range.file_suffix()
+
+        if extra_file_suffix is None:
+            file_suffix = page_range_suffix
+        else:
+            # The extra suffix may be an exception message. A path separator in it would
+            # silently truncate the file name when '.name' is taken below and characters
+            # like ':' or '<' are illegal on some file systems, so replace them all.
+            safe_extra_suffix = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', extra_file_suffix)
+            file_suffix = f"{page_range_suffix}__{safe_extra_suffix}"
+
+        extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
+        extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
+        stderr_console.print(f"Extracting {page_range_suffix} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
+        pdf_writer = PdfWriter()
+
+        with open(self.file_path, 'rb') as source_pdf:
+            pdf_writer.append(fileobj=source_pdf, pages=page_range.to_tuple())
+
+        if SortableFile.confirm_file_overwrite(extracted_pages_pdf_path):
+            with open(extracted_pages_pdf_path, 'wb') as extracted_pages_pdf:
+                pdf_writer.write(extracted_pages_pdf)
+
+            # Only report success when the file was actually written.
+            stderr_console.print(f"Wrote new PDF '{extracted_pages_pdf_path}'.")
+
+        return extracted_pages_pdf_path
+
+    def print_extracted_text(self, page_range: Optional[PageRange] = None) -> None:
+        """Print the filename panel, then the extracted text (limited to 'page_range' if one is given)."""
+        console.print(self._filename_panel())
+        text = self.extracted_text(page_range=page_range)
+        console.print(text)
+
     def _can_be_presented_in_popup(self) -> bool:
         if type(self).is_presentable_in_popup is None:
             type(self).is_presentable_in_popup = check_for_pymupdf()
@@ -122,5 +171,29 @@ def _log_to_stderr(self, msg: str) -> None:
 
         stderr_console.print(msg)
 
+    def _handle_extraction_error(self, page_number: int, error_msg: str) -> None:
+        """
+        Rip the offending page to a new file and suggest that user report bug to PyPDF.
+
+        Args:
+            page_number: Number of the page that triggered the error.
+            error_msg: Description of the error. Shown in the warning panel and also passed
+                to extract_page_range() as the extra file name suffix.
+        """
+        # Config.pdf_errors_dir is only assigned by Config.set_directories(), so fall back
+        # to the default directory when that hasn't been called.
+        destination_dir = getattr(Config, 'pdf_errors_dir', DEFAULT_PDF_ERRORS_DIR)
+        extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
+
+        blink_txt = Text('', style='bright_white')
+        blink_txt.append("An error (", style='blink color(154)').append(error_msg, style='color(11) blink')
+        blink_txt.append(') ', style='blink color(154)')
+        blink_txt.append("was encountered while processing a PDF file.\n\n", style='blink color(154)')
+
+        txt = Text("The error was of a type such that it probably came from a bug in ", style='bright_white')
+        txt.append('PyPDF', style='underline bright_green').append('. It was encountered processing the file ')
+        txt.append(str(self.file_path), style='file').append('. You should see a stack trace above this box.\n\n')
+
+        txt.append('The offending page will be extracted to ', style='bright_white')
+        txt.append(str(extracted_file), style='file').append('.\n\n')
+        txt.append("Please visit 'https://github.com/py-pdf/pypdf/issues' to report a bug. ", style='bold')
+        txt.append("Providing the devs with the extracted page and the stack trace will help improve pypdf.")
+        stderr_console.print(attention_getting_panel(blink_txt + txt, title='PyPDF Error'))
+
     def __repr__(self) -> str:
         return f"PdfFile('{self.file_path}')"