Skip to content

Commit

Permalink
extract offending pages when PyPDF throws an error (also a small PDF …
Browse files Browse the repository at this point in the history
…page extraction script) (#6)

* PdfFile.extract_page_range()

* refactor overwrite confirmations

* adjust to single page if 0 requested

* Better parser for PDF page extract

* Looking good

* Page ranges work on extracting text

* Make it work

* attention_getting_panel()

* .gitignore

* ICP rules

* README

* style

* Check dir exists before create

* Update help screenshots

* fix image in readme

* comment

* cruft

---------

Co-authored-by: ashariyar <[email protected]>
  • Loading branch information
michelcrypt4d4mus and ashariyar authored Jul 30, 2023
1 parent fb14f89 commit 0993917
Show file tree
Hide file tree
Showing 23 changed files with 395 additions and 118 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
Expand Down Expand Up @@ -132,3 +131,4 @@ dmypy.json
tests/tmp/
.clown_sort
custom_sort_rules.csv
PDF Errors/
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
# NEXT RELEASE
* New script `extract_pages_from_pdf` lets you easily rip pages out of a PDF
* Add `--page-range` argument to both `extract_pages_from_pdf` and `extract_text_from_files`
* `pypdf` exceptions now cause the offending page to be extracted to its own PDF, and the user is encouraged to submit that page to the `pypdf` team
* Bump `pypdf` to version 3.14.0 (fixes for many bugs on edge case PDFs)
* Better handling of sort rules that fail to parse
* Suppress `/JBIG2Decode` warning output
* New crypto sort rules
* Rename `--print-when-parsed` command line option to `--print-as-parsed`
* Suppress `/JBIG2Decode` warning output when decoding PDFs
* Refactor overwrite confirmation, use stderr

### 1.9.2
* Allow `Pillow` 10.0.0
Expand Down
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,10 @@ pipx install clown_sort[gui]


## One Off Extractions
`extract_text_from_files` (installed at the same time) is a convenient script you can use to extract text from a single file, multiple files, or all the files in a given directory using Google's best in class Tesseract library.
There are several utilities / convenience scripts / whatever you want to call them that are installed along with `clown_sort`. They work with the same toolset as `sort_screenshots` but use it to simplify the extraction of text from images (or PDFs that are actually just a set of page sized images).

#### Extracting text
`extract_text_from_files` is a convenient script you can use to extract text from a single file, multiple files, or all the files in a given directory using Google's best in class Tesseract library.

![](doc/extract_text_from_files_help.png)

Expand All @@ -148,6 +151,10 @@ extract_text_from_files MY_FILE1 MY_FILE2 SOME_DIR3

This will parse and display the text in `MY_FILE1`, `MY_FILE2`, and all the files in `SOME_DIR3`.

#### Extracting pages of a PDF to a new PDF
`extract_pages_from_pdf` is a small script that can extract page ranges (e.g. "10-25") from PDFs on the command line.
![](doc/extract_pages_from_pdf_help.png)


# Contributing
Feel free to file issues or open pull requests.
Expand Down
33 changes: 17 additions & 16 deletions clown_sort/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import sys
"""
Entry point for all of the clown_sort scripts.
"""
from argparse import Namespace
from glob import glob
from os import environ, getcwd, path
Expand All @@ -14,7 +16,7 @@
load_dotenv(dotenv_path=dotenv_file)
break

from clown_sort.util.argument_parser import extraction_parser
from clown_sort.util.argument_parser import parse_text_extraction_args, parse_pdf_page_extraction_args
from clown_sort.config import Config
from clown_sort.files.image_file import ImageFile
from clown_sort.files.pdf_file import PdfFile
Expand Down Expand Up @@ -48,31 +50,30 @@ def extract_text_from_files() -> None:
Extract text from a single file or from all files in a given directory. Can accept
multiple paths as arguments on the command line.
"""
args: Namespace = extraction_parser.parse_args()
args: Namespace = parse_text_extraction_args()
console.line()
files_to_process = []

if args.debug:
Config.enable_debug_mode()
if args.print_when_parsed:
Config.print_when_parsed = True
if args.print_as_parsed:
Config.print_as_parsed = True

for file_or_dir in args.file_or_dir:
file_path = Path(file_or_dir)
for file_path in args.files_to_process:
sortable_file = build_sortable_file(file_path)

if not file_path.exists():
console.print(f"File '{file_path}' doesn't exist!")
sys.exit(-1)
elif file_path.is_dir():
files_to_process.extend(files_in_dir(file_path))
if isinstance(sortable_file, PdfFile):
sortable_file.print_extracted_text(page_range=args.page_range)
else:
files_to_process.append(file_path)
sortable_file.print_extracted_text()

for file_path in files_to_process:
build_sortable_file(file_path).print_extracted_text()
console.line(2)


def extract_pages_from_pdf() -> None:
    """
    Script entry point: copy a range of pages out of a PDF into a new PDF.

    The source PDF, the page range, and the destination directory all come from
    the namespace returned by parse_pdf_page_extraction_args().
    """
    args = parse_pdf_page_extraction_args()
    source_pdf = PdfFile(args.pdf_file)
    source_pdf.extract_page_range(args.page_range, destination_dir=args.destination_dir)


def set_screenshot_timestamps_from_filenames():
"""Parse the filenames to reset the file creation timestamps."""
Config.configure()
Expand Down
15 changes: 7 additions & 8 deletions clown_sort/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import re
import sys
from argparse import Namespace
from collections import namedtuple
from importlib.metadata import version
from os import environ
from pathlib import Path
Expand All @@ -19,7 +18,7 @@
from clown_sort.util import rich_helper
from clown_sort.util.argument_parser import parser
from clown_sort.util.constants import PACKAGE_NAME
from clown_sort.util.filesystem_helper import subdirs_of_dir
from clown_sort.util.filesystem_helper import create_dir_if_it_does_not_exist, subdirs_of_dir
from clown_sort.util.logging import log, set_log_level


Expand All @@ -34,7 +33,7 @@ class Config:
delete_originals: bool = False
rescan_sorted: bool = False
yes_overwrite: bool = False
print_when_parsed: bool = False
print_as_parsed: bool = False
sort_rules: List[SortRule] = []
filename_regex: re.Pattern

Expand Down Expand Up @@ -119,11 +118,10 @@ def set_directories(
cls.destination_dir: Path = Path(destination_dir or screenshots_dir)
cls.sorted_screenshots_dir = cls.destination_dir.joinpath('Sorted')
cls.processed_screenshots_dir = cls.destination_dir.joinpath('Processed')
cls.pdf_errors_dir = cls.destination_dir.joinpath('PDF Errors')

for dir in [cls.destination_dir, cls.sorted_screenshots_dir, cls.processed_screenshots_dir]:
if not dir.is_dir():
log.warning(f"Need to create '{dir}'")
dir.mkdir(parents=True, exist_ok=True)
create_dir_if_it_does_not_exist(dir)

cls._log_configured_paths()

Expand Down Expand Up @@ -163,6 +161,7 @@ def _log_configured_paths(cls) -> None:
log.debug(f"destination_dir: {cls.destination_dir}")
log.debug(f"sorted_screenshots_dir: {cls.sorted_screenshots_dir}")
log.debug(f"processed_screenshots_dir: {cls.processed_screenshots_dir}")
log.debug(f"pdf_errors_dir: {cls.pdf_errors_dir}")


def _check_for_pysimplegui():
Expand All @@ -177,7 +176,7 @@ def _check_for_pysimplegui():
)

log_optional_module_warning('gui', msg)
console = Console()
console = Console(color_system='256')
#console.line()
console.print(f"You make also need to install 'python-tk'. In macOS this can be installed with 'brew install python-tk'.")
sys.exit()
Expand All @@ -201,7 +200,7 @@ def log_optional_module_warning(module_name: str, msg: Optional[Text] = None) ->
style='bright_white'
)

console = Console()
console = Console(color_system='256')
console.line()
console.print(msg)
console.line()
Expand Down
5 changes: 1 addition & 4 deletions clown_sort/filename_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,8 @@
from difflib import SequenceMatcher
from typing import Optional

from rich.text import Text

from clown_sort.util.logging import log
from clown_sort.util.rich_helper import console
from clown_sort.util.string_helper import strip_bad_chars, strip_mac_screenshot
from clown_sort.util.filesystem_helper import strip_bad_chars, strip_mac_screenshot

TWEET_REPLY_REGEX = re.compile(
'Replying to (@\\w{3,15}).*?\\n(?P<body>.*)',
Expand Down
95 changes: 84 additions & 11 deletions clown_sort/files/pdf_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,42 @@
Wrapper for PDF files.
"""
import io
import logging
import os
from pathlib import Path
from typing import Optional

from PIL import Image
from pypdf import PdfReader
from pypdf import PdfReader, PdfWriter
from pypdf.errors import DependencyError, EmptyFileError
from rich.console import Console
from rich.panel import Panel
from rich.text import Text

from clown_sort.config import Config, check_for_pymupdf, log_optional_module_warning
from clown_sort.files.image_file import ImageFile
from clown_sort.files.sortable_file import SortableFile
from clown_sort.util.constants import MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR
from clown_sort.lib.page_range import PageRange
from clown_sort.util.constants import MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR, PDF_ERRORS
from clown_sort.util.filesystem_helper import create_dir_if_it_does_not_exist, insert_suffix_before_extension
from clown_sort.util.logging import log
from clown_sort.util.rich_helper import WARNING, console, stderr_console
from clown_sort.util.rich_helper import WARNING, attention_getting_panel, console, mild_warning, stderr_console
from clown_sort.util.string_helper import exception_str

DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath(PDF_ERRORS)
MAX_DISPLAY_HEIGHT = 600
SCALE_FACTOR = 0.4


class PdfFile(SortableFile):
is_presentable_in_popup = None

def extracted_text(self) -> Optional[str]:
def extracted_text(self, page_range: Optional[PageRange] = None) -> Optional[str]:
"""Use Tesseract to OCR the text in the image, which is returned as a string."""
if self.text_extraction_attempted:
return self._extracted_text

log.debug(f"Extracting text from '{self.file_path}'...")
self.page_numbers_of_errors = []
extracted_pages = []

try:
Expand All @@ -39,6 +46,10 @@ def extracted_text(self) -> Optional[str]:
log.debug(f"PDF Page count: {page_count}")

for page_number, page in enumerate(pdf_reader.pages, start=1):
if page_range and not page_range.in_range(page_number):
self._log_to_stderr(f"Skipping page {page_number}...")
continue

self._log_to_stderr(f"Parsing page {page_number}...")
page_buffer = Console(file=io.StringIO())
page_buffer.print(Panel(f"PAGE {page_number}", padding=(0, 15), expand=False))
Expand All @@ -54,20 +65,24 @@ def extracted_text(self) -> Optional[str]:
image_obj = Image.open(io.BytesIO(image.data))
image_text = ImageFile.extract_text(image_obj, f"{self.file_path} ({image_name})")
page_buffer.print((image_text or '').strip())
except NotImplementedError as e:
stderr_console.print(f"WARNING: {type(e).__name__}: {e} while parsing embedded image {image_number} on page {page_number}...")
except (OSError, NotImplementedError, TypeError, ValueError) as e:
error_str = exception_str(e)
msg = f"{error_str} while parsing embedded image {image_number} on page {page_number}..."
mild_warning(msg)

# Dump an error PDF and encourage user to report to pypdf team.
if 'JBIG2Decode' not in str(e):
stderr_console.print_exception()
except (OSError, TypeError, ValueError) as e:
stderr_console.print(f"WARNING: {type(e).__name__}: {e} while parsing embedded image {image_number} on page {page_number}...")
stderr_console.print_exception()

if page_number not in self.page_numbers_of_errors:
self._handle_extraction_error(page_number, error_str)
self.page_numbers_of_errors.append(page_number)

page_text = page_buffer.file.getvalue()
extracted_pages.append(page_text)
log.debug(page_text)

if Config.print_when_parsed:
if Config.print_as_parsed:
print(f"{page_text}")
except DependencyError:
log_optional_module_warning('pdf')
Expand Down Expand Up @@ -104,6 +119,40 @@ def thumbnail_bytes(self) -> Optional[bytes]:

return page.get_pixmap(matrix=zoom_matrix, clip= clip, alpha=False).tobytes()

def extract_page_range(
        self,
        page_range: PageRange,
        destination_dir: Optional[Path] = None,
        extra_file_suffix: Optional[str] = None
    ) -> Path:
    """
    Extract a range of pages from this PDF to a new PDF file.

    Args:
        page_range: The pages to copy. Its file_suffix() is used in the output
            file name and its to_tuple() is handed to PdfWriter.append().
        destination_dir: Directory to write the new PDF to. Falls back to
            DEFAULT_PDF_ERRORS_DIR when not provided.
        extra_file_suffix: Optional extra text appended to the output file name
            after the page range suffix. Characters that are unsafe in file
            names are replaced with underscores.

    Returns:
        The path of the new PDF. NOTE: the path is returned even when the user
        declines to overwrite an existing file, in which case nothing is written.
    """
    # Accept str as well as Path so the .joinpath() call below always works.
    destination_dir = Path(destination_dir or DEFAULT_PDF_ERRORS_DIR)
    create_dir_if_it_does_not_exist(destination_dir)
    range_suffix = page_range.file_suffix()

    if extra_file_suffix is None:
        file_suffix = range_suffix
    else:
        # The extra suffix can be arbitrary text (e.g. an exception message). A path
        # separator in it would make `.name` below silently drop everything before the
        # separator, and characters like ':' are not valid in file names on every OS.
        safe_suffix = ''.join(
            char if char.isalnum() or char in ' ._-()' else '_'
            for char in extra_file_suffix
        )
        file_suffix = f"{range_suffix}__{safe_suffix}"

    extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
    extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
    stderr_console.print(f"Extracting {range_suffix} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
    pdf_writer = PdfWriter()

    with open(self.file_path, 'rb') as source_pdf:
        pdf_writer.append(fileobj=source_pdf, pages=page_range.to_tuple())

    if SortableFile.confirm_file_overwrite(extracted_pages_pdf_path):
        with open(extracted_pages_pdf_path, 'wb') as extracted_pages_pdf:
            pdf_writer.write(extracted_pages_pdf)

        # Only report success when the file was actually written.
        stderr_console.print(f"Wrote new PDF '{extracted_pages_pdf_path}'.")

    return extracted_pages_pdf_path

def print_extracted_text(self, page_range: Optional[PageRange] = None) -> None:
    """
    Print the filename panel followed by the text extracted from this PDF.

    Args:
        page_range: Optional page range forwarded to extracted_text().
    """
    # The panel is printed before extraction starts so that anything emitted while
    # the pages are being parsed shows up underneath the file's header.
    console.print(self._filename_panel())
    text = self.extracted_text(page_range=page_range)
    console.print(text)

def _can_be_presented_in_popup(self) -> bool:
if type(self).is_presentable_in_popup is None:
type(self).is_presentable_in_popup = check_for_pymupdf()
Expand All @@ -122,5 +171,29 @@ def _log_to_stderr(self, msg: str) -> None:

stderr_console.print(msg)

def _handle_extraction_error(self, page_number: int, error_msg: str) -> None:
    """
    Rip the offending page to a new file and suggest that user report bug to PyPDF.

    Args:
        page_number: Number of the page that raised the error; wrapped in a
            single page PageRange for extraction.
        error_msg: Description of the error. Shown in the warning panel and also
            passed to extract_page_range() as the extra file name suffix.
    """
    # Config.pdf_errors_dir is only assigned by Config.set_directories(), so it may not
    # exist yet (presumably when running the standalone extraction scripts - TODO confirm).
    destination_dir = getattr(Config, 'pdf_errors_dir', DEFAULT_PDF_ERRORS_DIR)
    extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)

    # Blinking headline: what went wrong.
    blink_txt = Text('', style='bright_white')
    blink_txt.append("An error (", style='blink color(154)').append(error_msg, style='color(11) blink')
    blink_txt.append(') ', style='blink color(154)')
    blink_txt.append("was encountered while processing a PDF file.\n\n", style='blink color(154)')

    # Body: where the error probably came from and which file triggered it.
    txt = Text("The error was of a type such that it probably came from a bug in ", style='bright_white')
    txt.append('PyPDF', style='underline bright_green').append('. It was encountered processing the file ')
    txt.append(str(self.file_path), style='file').append('. You should see a stack trace above this box.\n\n')

    # Call to action: point the user at the extracted page and the pypdf issue tracker.
    txt.append('The offending page will be extracted to ', style='bright_white')
    txt.append(str(extracted_file), style='file').append('.\n\n')
    txt.append("Please visit 'https://github.com/py-pdf/pypdf/issues' to report a bug. ", style='bold')
    txt.append("Providing the devs with the extracted page and the stack trace helps improve pypdf.")
    stderr_console.print(attention_getting_panel(blink_txt + txt, title='PyPDF Error'))

def __repr__(self) -> str:
    """Return a debugging representation: the class name wrapping the file path."""
    path = self.file_path
    return f"PdfFile('{path}')"
Loading

0 comments on commit 0993917

Please sign in to comment.