From 4bd54bd32d525ec41e23960146fbb3b0418a4715 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Sun, 28 Jul 2024 17:16:57 +0200 Subject: [PATCH 01/42] DEV: Test against Python 3.13 (#2776) * DEV: Test against Python 3.13 * fix typo * add missing setup-python * fix another typo * update Pillow version * attempt to update coverage package * update number of expected coverage files --- .github/workflows/github-ci.yaml | 10 +++++----- requirements/ci-3.11.txt | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 820ccdcaa..1eb3d9bd0 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -57,7 +57,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"] use-crypto-lib: ["cryptography"] include: - python-version: "3.7" @@ -90,7 +90,7 @@ jobs: cache-dependency-path: '**/requirements/ci.txt' - name: Setup Python (3.11+) uses: actions/setup-python@v5 - if: matrix.python-version == '3.11' || matrix.python-version == '3.12' + if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev' with: python-version: ${{ matrix.python-version }} allow-prereleases: true @@ -106,7 +106,7 @@ jobs: - name: Install requirements (Python 3.11+) run: | pip install -r requirements/ci-3.11.txt - if: matrix.python-version == '3.11' || matrix.python-version == '3.12' + if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev' - name: Remove pycryptodome and cryptography run: | pip uninstall pycryptodome cryptography -y @@ -215,8 +215,8 @@ jobs: - name: Check Number of Downloaded Files run: | downloaded_files_count=$(find \.coverage* -type f | wc -l) - if [ $downloaded_files_count -eq 8 ]; then - echo 
"The expected number of files (8) were downloaded." + if [ $downloaded_files_count -eq 9 ]; then + echo "The expected number of files (9) were downloaded." else echo "ERROR: Expected 8 files, but found $downloaded_files_count files." exit 1 diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt index f382fe2b9..210177118 100644 --- a/requirements/ci-3.11.txt +++ b/requirements/ci-3.11.txt @@ -6,7 +6,7 @@ # attrs==23.1.0 # via flake8-bugbear -coverage[toml]==7.3.0 +coverage[toml]==7.6.0 # via # -r requirements/ci.in # pytest-cov @@ -35,7 +35,7 @@ mypy-extensions==1.0.0 # via mypy packaging==23.1 # via pytest -pillow==10.0.1 +pillow==10.4.0 # via # -r requirements/ci.in # fpdf2 From d4df20d14cb6a2839c1ab141b51e70652fb3d1f1 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 31 Jul 2024 10:46:08 +0100 Subject: [PATCH 02/42] STY: Remove boolean value comparison (#2779) PEP 8 recommendation. --- pypdf/annotations/_markup_annotations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/annotations/_markup_annotations.py b/pypdf/annotations/_markup_annotations.py index 4db8dfdbf..98a222483 100644 --- a/pypdf/annotations/_markup_annotations.py +++ b/pypdf/annotations/_markup_annotations.py @@ -104,9 +104,9 @@ def __init__( self[NameObject("/Rect")] = RectangleObject(rect) font_str = "font: " - if bold is True: + if bold: font_str = f"{font_str}bold " - if italic is True: + if italic: font_str = f"{font_str}italic " font_str = f"{font_str}{font} {font_size}" font_str = f"{font_str};text-align:left;color:#{font_color}" From 3ad9234c2ec08e7cd6a8b2ec962386eda394d76d Mon Sep 17 00:00:00 2001 From: "William G. Gagnon" Date: Fri, 2 Aug 2024 11:21:53 -0400 Subject: [PATCH 03/42] ROB: Handle images with empty data when processing an image from bytes (#2786) Closes #2783. 
--- CONTRIBUTORS.md | 1 + pypdf/_xobj_image_helpers.py | 9 ++++++--- pypdf/errors.py | 4 ++++ tests/test_xobject_image_helpers.py | 13 +++++++++++-- 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 84f0b6ee4..89fec3b14 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -19,6 +19,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr * [ediamondscience](https://github.com/ediamondscience) * [Ermeson, Felipe](https://github.com/FelipeErmeson) * [Freitag, François](https://github.com/francoisfreitag) +* [Gagnon, William G.](https://github.com/williamgagnon) * [Górny, Michał](https://github.com/mgorny) * [Grillo, Miguel](https://github.com/Ineffable22) * [Gutteridge, David H.](https://github.com/dhgutteridge) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 45b0c145b..5ae8894fa 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -6,7 +6,7 @@ from ._utils import check_if_whitespace_only, logger_warning from .constants import ColorSpaces -from .errors import PdfReadError +from .errors import EmptyImageDataError, PdfReadError from .generic import ( ArrayObject, DecodedStreamObject, @@ -148,9 +148,12 @@ def _extended_image_frombytes( img = Image.frombytes(mode, size, data) except ValueError as exc: nb_pix = size[0] * size[1] - if len(data) % nb_pix != 0: + data_length = len(data) + if data_length == 0: + raise EmptyImageDataError("Data is 0 bytes, cannot process an image from empty data.") from exc + if data_length % nb_pix != 0: raise exc - k = nb_pix * len(mode) / len(data) + k = nb_pix * len(mode) / data_length data = b"".join([bytes((x,) * int(k)) for x in data]) img = Image.frombytes(mode, size, data) return img diff --git a/pypdf/errors.py b/pypdf/errors.py index c962dec66..ad197ffc1 100644 --- a/pypdf/errors.py +++ b/pypdf/errors.py @@ -59,4 +59,8 @@ class EmptyFileError(PdfReadError): """Raised when a PDF file is empty or 
has no content.""" +class EmptyImageDataError(PyPdfError): + """Raised when trying to process an image that has no data.""" + + STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly" diff --git a/tests/test_xobject_image_helpers.py b/tests/test_xobject_image_helpers.py index 63ecebd9b..39b7131fc 100644 --- a/tests/test_xobject_image_helpers.py +++ b/tests/test_xobject_image_helpers.py @@ -4,8 +4,8 @@ import pytest from pypdf import PdfReader -from pypdf._xobj_image_helpers import _handle_flate -from pypdf.errors import PdfReadError +from pypdf._xobj_image_helpers import _extended_image_frombytes, _handle_flate +from pypdf.errors import EmptyImageDataError, PdfReadError from pypdf.generic import ArrayObject, DecodedStreamObject, NameObject, NumberObject from . import get_data_from_url @@ -113,3 +113,12 @@ def test_handle_flate__image_mode_1(): colors=2, obj_as_text="dummy", ) + + +def test_extended_image_frombytes_zero_data(): + mode = "RGB" + size = (1, 1) + data = b"" + + with pytest.raises(EmptyImageDataError, match="Data is 0 bytes, cannot process an image from empty data."): + _extended_image_frombytes(mode, size, data) From 582557e09a7e658fdcb19f26eb069d87875489f0 Mon Sep 17 00:00:00 2001 From: Diogo Teles Sant'Anna Date: Fri, 2 Aug 2024 15:49:29 -0300 Subject: [PATCH 04/42] SEC: Fix GitHub workflow vulnerable to script injection (#2787) Signed-off-by: Diogo Teles Sant'Anna --- .github/workflows/release.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9f782ec08..b1a4fb27f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -12,6 +12,9 @@ on: permissions: contents: write +env: + HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }} + jobs: build_and_publish: name: Publish a new version @@ -24,7 +27,7 @@ jobs: - name: Extract version from commit message id: extract_version run: | - VERSION=$(echo "${{ 
github.event.head_commit.message }}" | grep -oP '(?<=REL: )\d+\.\d+\.\d+') + VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+') echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Extract tag message from commit message @@ -32,7 +35,7 @@ jobs: run: | VERSION="${{ steps.extract_version.outputs.version }}" delimiter="$(openssl rand -hex 8)" - MESSAGE=$(echo "${{ github.event.head_commit.message }}" | sed "0,/REL: $VERSION/s///" ) + MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" ) echo "message<<${delimiter}" >> $GITHUB_OUTPUT echo "$MESSAGE" >> $GITHUB_OUTPUT echo "${delimiter}" >> $GITHUB_OUTPUT From 38f3925502c2971ad587fb616500b6f8b6333d03 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:10:47 +0100 Subject: [PATCH 05/42] MAINT: Remove unused paeth_predictor (#2773) --- pypdf/_utils.py | 14 -------------- tests/test_utils.py | 18 ------------------ 2 files changed, 32 deletions(-) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 38c0d67d7..6569707b6 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -390,20 +390,6 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]" -def paeth_predictor(left: int, up: int, up_left: int) -> int: - p = left + up - up_left - dist_left = abs(p - left) - dist_up = abs(p - up) - dist_up_left = abs(p - up_left) - - if dist_left <= dist_up and dist_left <= dist_up_left: - return left - elif dist_up <= dist_up_left: - return up - else: - return up_left - - def deprecate(msg: str, stacklevel: int = 3) -> None: warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel) diff --git a/tests/test_utils.py b/tests/test_utils.py index 81fcf9fb4..856bedd86 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -132,24 +132,6 @@ def test_deprecate_no_replacement(): assert warn[0].message.args[0] == error_msg -@pytest.mark.parametrize( - ("left", "up", "upleft", 
"expected"), - [ - (0, 0, 0, 0), - (1, 0, 0, 1), - (0, 1, 0, 1), - (0, 0, 1, 0), - (1, 2, 3, 1), - (2, 1, 3, 1), - (1, 3, 2, 2), - (3, 1, 2, 2), - (3, 2, 1, 3), - ], -) -def test_paeth_predictor(left, up, upleft, expected): - assert pypdf._utils.paeth_predictor(left, up, upleft) == expected - - @pytest.mark.parametrize( ("dat", "pos", "to_read", "expected", "expected_pos"), [ From 09f9b7ed52193bfd9e98bdd018ccaf7cbe821687 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Mon, 5 Aug 2024 16:49:29 +0100 Subject: [PATCH 06/42] MAINT: Remove unused AnnotationFlag --- pypdf/annotations/_non_markup_annotations.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pypdf/annotations/_non_markup_annotations.py b/pypdf/annotations/_non_markup_annotations.py index dcdb3b0ff..6272cceee 100644 --- a/pypdf/annotations/_non_markup_annotations.py +++ b/pypdf/annotations/_non_markup_annotations.py @@ -1,6 +1,5 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union -from ..constants import AnnotationFlag from ..generic._base import ( BooleanObject, NameObject, @@ -12,8 +11,6 @@ from ..generic._rectangle import RectangleObject from ._base import AnnotationDictionary -DEFAULT_ANNOTATION_FLAG = AnnotationFlag(0) - class Link(AnnotationDictionary): def __init__( From b2d72043ab5221b58138c7d06c181b8cbc88ea8e Mon Sep 17 00:00:00 2001 From: owurman Date: Mon, 5 Aug 2024 12:14:18 -0700 Subject: [PATCH 07/42] BUG: Handle Sequence as an IndirectObject when extracting text with layout mode (#2788) * Handle Sequence as an IndirectObject The spec allows an int or float to be an IndirectObject as well, but this commit does not address that theoretical possibility. 
* Update pypdf/_text_extraction/_layout_mode/_font.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> * Address PR comments -Rename w_1 to w_next_entry -Utilize ParseError instead of PdfReadError -Write a test (both positive and negative) * Handle unlikely case of IndirectObjects for float/int width elements Also adds a comment to clarify that we don't explicitly handle the IndexError exception. Rather, we let it be raised as an IndexError. * Yoda condition I removed * Last commit was a bad patch, confused by non-committed changes * Use test files from URL rather than resources * Update tests/test_text_extraction.py Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> * Fix code style warnings in range() call --------- Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> --- pypdf/_text_extraction/_layout_mode/_font.py | 26 +++++++++++++------- tests/test_text_extraction.py | 17 +++++++++++++ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py index a912fddb2..40655b1b2 100644 --- a/pypdf/_text_extraction/_layout_mode/_font.py +++ b/pypdf/_text_extraction/_layout_mode/_font.py @@ -1,8 +1,9 @@ """Font constants and classes for "layout" mode text operations""" from dataclasses import dataclass, field -from typing import Any, Dict, Sequence, Union +from typing import Any, Dict, Sequence, Union, cast +from ...errors import ParseError from ...generic import IndirectObject from ._font_widths import STANDARD_WIDTHS @@ -58,6 +59,7 @@ def __post_init__(self) -> None: skip_count = 0 _w = d_font.get("/W", []) for idx, w_entry in enumerate(_w): + w_entry = w_entry.get_object() if skip_count: skip_count -= 1 continue @@ -66,13 +68,14 @@ def __post_init__(self) -> None: # warning and or use reader's "strict" to force an ex??? 
continue # check for format (1): `int [int int int int ...]` - if isinstance(_w[idx + 1], Sequence): - start_idx, width_list = _w[idx : idx + 2] + w_next_entry = _w[idx + 1].get_object() + if isinstance(w_next_entry, Sequence): + start_idx, width_list = w_entry, w_next_entry self.width_map.update( { ord_map[_cidx]: _width for _cidx, _width in zip( - range(start_idx, start_idx + len(width_list), 1), + range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1), width_list, ) if _cidx in ord_map @@ -80,18 +83,23 @@ def __post_init__(self) -> None: ) skip_count = 1 # check for format (2): `int int int` - if not isinstance(_w[idx + 1], Sequence) and not isinstance( - _w[idx + 2], Sequence - ): - start_idx, stop_idx, const_width = _w[idx : idx + 3] + elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)): + start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object() self.width_map.update( { ord_map[_cidx]: const_width - for _cidx in range(start_idx, stop_idx + 1, 1) + for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1) if _cidx in ord_map } ) skip_count = 2 + else: + # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions + # while expecting more elements). This raises an IndexError which is sufficient. + raise ParseError( + f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}" + ) # pragma: no cover + if not self.width_map and "/BaseFont" in self.font_dictionary: for key in STANDARD_WIDTHS: if self.font_dictionary["/BaseFont"].startswith(f"/{key}"): diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 1ffa68a3e..dcd4e6cae 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -10,6 +10,7 @@ from pypdf import PdfReader, mult from pypdf._text_extraction import set_custom_rtl +from pypdf.errors import ParseError from . 
import get_data_from_url @@ -156,3 +157,19 @@ def test_layout_mode_type0_font_widths(): encoding="utf-8" ) assert expected == reader.pages[0].extract_text(extraction_mode="layout") + + +@pytest.mark.enable_socket() +def test_layout_mode_indirect_sequence_font_widths(): + # Cover the situation where the sequence for font widths is an IndirectObject + # ref https://github.com/py-pdf/pypdf/pull/2788 + url = "https://github.com/user-attachments/files/16491621/2788_example.pdf" + name ="2788_example.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert reader.pages[0].extract_text(extraction_mode="layout") == "" + url = "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf" + name = "2788_example_malformed.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + with pytest.raises(ParseError) as exc: + reader.pages[0].extract_text(extraction_mode="layout") + assert str(exc.value).startswith("Invalid font width definition") From 5abd590740a2718fc69b8477c656ce5515a0ab33 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:14:17 +0100 Subject: [PATCH 08/42] STY: Refactor b_ (#2772) --- pypdf/_utils.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 6569707b6..5fecb38e7 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -347,14 +347,11 @@ def b_(s: Union[str, bytes]) -> bytes: return bc[s] try: r = s.encode("latin-1") - if len(s) < 2: - bc[s] = r - return r - except Exception: + except UnicodeEncodeError: r = s.encode("utf-8") - if len(s) < 2: - bc[s] = r - return r + if len(s) < 2: + bc[s] = r + return r def str_(b: Any) -> str: From 219eb13f7eb9c2cd9519e9a69d639250853bd823 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:52:40 +0200 Subject: [PATCH 09/42] MAINT: Drop Python 3.7 support (#2793) --- 
.github/workflows/github-ci.yaml | 14 +++++++------- pypdf/_page.py | 8 +------- pypdf/_protocols.py | 8 +------- .../_layout_mode/_fixed_width_page.py | 8 +------- pypdf/_xobj_image_helpers.py | 13 ++++--------- pypdf/types.py | 8 +------- 6 files changed, 15 insertions(+), 44 deletions(-) diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 1eb3d9bd0..d5d9bb4d4 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -57,12 +57,12 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"] use-crypto-lib: ["cryptography"] include: - - python-version: "3.7" + - python-version: "3.8" use-crypto-lib: "pycryptodome" - - python-version: "3.7" + - python-version: "3.8" use-crypto-lib: "none" steps: - name: Update APT packages @@ -83,7 +83,7 @@ jobs: key: cache-downloaded-files - name: Setup Python uses: actions/setup-python@v5 - if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10' + if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10' with: python-version: ${{ matrix.python-version }} cache: 'pip' @@ -102,7 +102,7 @@ jobs: - name: Install requirements (Python 3) run: | pip install -r requirements/ci.txt - if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10' + if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10' - name: Install requirements (Python 3.11+) run: | pip install -r requirements/ci-3.11.txt @@ -215,8 +215,8 @@ jobs: - name: Check Number of Downloaded Files run: | downloaded_files_count=$(find \.coverage* -type f | wc -l) - if [ $downloaded_files_count -eq 9 ]; then - echo "The expected number of 
files (9) were downloaded." + if [ $downloaded_files_count -eq 8 ]; then + echo "The expected number of files (8) were downloaded." else echo "ERROR: Expected 8 files, but found $downloaded_files_count files." exit 1 diff --git a/pypdf/_page.py b/pypdf/_page.py index 63038d9d0..ee1dc7f60 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -28,7 +28,6 @@ # POSSIBILITY OF SUCH DAMAGE. import math -import sys from decimal import Decimal from pathlib import Path from typing import ( @@ -38,6 +37,7 @@ Iterable, Iterator, List, + Literal, Optional, Sequence, Set, @@ -85,12 +85,6 @@ StreamObject, ) -if sys.version_info >= (3, 8): - from typing import Literal -else: - from typing_extensions import Literal - - MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox' diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index 9f413660b..b5fa14879 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -2,13 +2,7 @@ from abc import abstractmethod from pathlib import Path -from typing import IO, Any, Dict, List, Optional, Tuple, Union - -try: - # Python 3.8+: https://peps.python.org/pep-0586 - from typing import Protocol -except ImportError: - from typing_extensions import Protocol # type: ignore[assignment] +from typing import IO, Any, Dict, List, Optional, Protocol, Tuple, Union from ._utils import StrByteType, StreamType diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py index 1be500959..e7af1b234 100644 --- a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py +++ b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py @@ -1,10 +1,9 @@ """Extract PDF text preserving the layout of the source PDF""" -import sys from itertools import groupby from math import ceil from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, TypedDict from ..._utils import logger_warning from .. 
import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS @@ -12,11 +11,6 @@ from ._text_state_manager import TextStateManager from ._text_state_params import TextStateParams -if sys.version_info >= (3, 8): - from typing import Literal, TypedDict -else: - from typing_extensions import Literal, TypedDict - class BTGroup(TypedDict): """ diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 5ae8894fa..7a3f40d95 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -2,7 +2,7 @@ import sys from io import BytesIO -from typing import Any, List, Tuple, Union, cast +from typing import Any, List, Literal, Tuple, Union, cast from ._utils import check_if_whitespace_only, logger_warning from .constants import ColorSpaces @@ -15,13 +15,6 @@ NullObject, ) -if sys.version_info[:2] >= (3, 8): - from typing import Literal -else: - # PEP 586 introduced typing.Literal with Python 3.8 - # For older Python versions, the backport typing_extensions is necessary: - from typing_extensions import Literal - if sys.version_info[:2] >= (3, 10): from typing import TypeAlias else: @@ -150,7 +143,9 @@ def _extended_image_frombytes( nb_pix = size[0] * size[1] data_length = len(data) if data_length == 0: - raise EmptyImageDataError("Data is 0 bytes, cannot process an image from empty data.") from exc + raise EmptyImageDataError( + "Data is 0 bytes, cannot process an image from empty data." 
+ ) from exc if data_length % nb_pix != 0: raise exc k = nb_pix * len(mode) / data_length diff --git a/pypdf/types.py b/pypdf/types.py index b8fbab92c..e383dc7b1 100644 --- a/pypdf/types.py +++ b/pypdf/types.py @@ -1,13 +1,7 @@ """Helpers for working with PDF types.""" import sys -from typing import List, Union - -if sys.version_info[:2] >= (3, 8): - # Python 3.8+: https://peps.python.org/pep-0586 - from typing import Literal -else: - from typing_extensions import Literal +from typing import List, Literal, Union if sys.version_info[:2] >= (3, 10): # Python 3.10+: https://www.python.org/dev/peps/pep-0484 From 46c89dd8e1e8641a49624f3fbc1865f9c4b41374 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:57:25 +0200 Subject: [PATCH 10/42] MAINT: Remove b_ and str_ (#2792) Closes #2726. Closes #2791. --- pypdf/_cmap.py | 30 +++---- pypdf/_doc_common.py | 3 +- pypdf/_encryption.py | 6 +- pypdf/_merger.py | 10 ++- pypdf/_page.py | 16 ++-- pypdf/_reader.py | 5 +- pypdf/_utils.py | 34 ++------ pypdf/_writer.py | 6 +- pypdf/filters.py | 53 ++++++++---- pypdf/generic/_base.py | 39 +++++---- pypdf/generic/_data_structures.py | 35 ++++---- pypdf/generic/_utils.py | 77 ++++++++++-------- ..._Vicksburg_Sample_OCR-crazyones-merged.pdf | Bin 217096 -> 217093 bytes tests/test_cmap.py | 18 +++- tests/test_page.py | 13 ++- tests/test_utils.py | 16 ---- tests/test_workflows.py | 4 +- tests/test_writer.py | 2 +- 18 files changed, 193 insertions(+), 174 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 9a2d10a61..d635724d2 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -3,11 +3,10 @@ from typing import Any, Dict, List, Tuple, Union, cast from ._codecs import adobe_glyphs, charset_encoding -from ._utils import b_, logger_error, logger_warning +from ._utils import logger_error, logger_warning from .generic import ( DecodedStreamObject, DictionaryObject, - IndirectObject, NullObject, StreamObject, ) @@ -258,7 
+257,7 @@ def prepare_cm(ft: DictionaryObject) -> bytes: tu = ft["/ToUnicode"] cm: bytes if isinstance(tu, StreamObject): - cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()) + cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data() elif isinstance(tu, str) and tu.startswith("/Identity"): # the full range 0000-FFFF will be processed cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange" @@ -448,34 +447,27 @@ def compute_space_width( en: int = cast(int, ft["/LastChar"]) if st > space_code or en < space_code: raise Exception("Not in range") - if w[space_code - st] == 0: + if w[space_code - st].get_object() == 0: raise Exception("null width") - sp_width = w[space_code - st] + sp_width = w[space_code - st].get_object() except Exception: if "/FontDescriptor" in ft and "/MissingWidth" in cast( DictionaryObject, ft["/FontDescriptor"] ): - sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore + sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore else: # will consider width of char as avg(width)/2 m = 0 cpt = 0 - for x in w: - if x > 0: - m += x + for xx in w: + xx = xx.get_object() + if xx > 0: + m += xx cpt += 1 sp_width = m / max(1, cpt) / 2 - if isinstance(sp_width, IndirectObject): - # According to - # 'Table 122 - Entries common to all font descriptors (continued)' - # the MissingWidth should be a number, but according to #2286 it can - # be an indirect object - obj = sp_width.get_object() - if obj is None or isinstance(obj, NullObject): - return 0.0 - return obj # type: ignore - + if sp_width is None or isinstance(sp_width, NullObject): + sp_width = 0.0 return sp_width diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index d4c5c43c3..ffbdb7882 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -49,7 +49,6 @@ from ._page import PageObject, _VirtualList from ._page_labels import index2label as page_index2page_label from ._utils import ( - b_, deprecate_with_replacement, logger_warning, 
parse_iso8824_date, @@ -1258,7 +1257,7 @@ def xfa(self) -> Optional[Dict[str, Any]]: if isinstance(f, IndirectObject): field = cast(Optional[EncodedStreamObject], f.get_object()) if field: - es = zlib.decompress(b_(field._data)) + es = zlib.decompress(field._data) retval[tag] = es return retval diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py index 5ddd8d0ef..e5cdd9324 100644 --- a/pypdf/_encryption.py +++ b/pypdf/_encryption.py @@ -43,7 +43,7 @@ rc4_encrypt, ) -from ._utils import b_, logger_warning +from ._utils import logger_warning from .generic import ( ArrayObject, ByteStringObject, @@ -78,7 +78,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject: elif isinstance(obj, StreamObject): obj2 = StreamObject() obj2.update(obj) - obj2.set_data(self.stm_crypt.encrypt(b_(obj._data))) + obj2.set_data(self.stm_crypt.encrypt(obj._data)) for key, value in obj.items(): # Dont forget the Stream dict. obj2[key] = self.encrypt_object(value) obj = obj2 @@ -96,7 +96,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject: data = self.str_crypt.decrypt(obj.original_bytes) obj = create_string_object(data) elif isinstance(obj, StreamObject): - obj._data = self.stm_crypt.decrypt(b_(obj._data)) + obj._data = self.stm_crypt.decrypt(obj._data) for key, value in obj.items(): # Dont forget the Stream dict. 
obj[key] = self.decrypt_object(value) elif isinstance(obj, DictionaryObject): diff --git a/pypdf/_merger.py b/pypdf/_merger.py index 7176a1adf..a52a354e3 100644 --- a/pypdf/_merger.py +++ b/pypdf/_merger.py @@ -46,7 +46,6 @@ from ._utils import ( StrByteType, deprecate_with_replacement, - str_, ) from ._writer import PdfWriter from .constants import GoToActionArguments, TypArguments, TypFitArguments @@ -82,6 +81,15 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None: self.id = id +# transfered from _utils : as this function is only required here +# and merger will be soon deprecated +def str_(b: Any) -> str: # pragma: no cover + if isinstance(b, bytes): + return b.decode("latin-1") + else: + return str(b) # will return b.__str__() if defined + + class PdfMerger: """ Use :class:`PdfWriter` instead. diff --git a/pypdf/_page.py b/pypdf/_page.py index ee1dc7f60..48cdeb149 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -846,7 +846,7 @@ def _add_transformation_matrix( FloatObject(e), FloatObject(f), ], - " cm", + b"cm", ], ) return contents @@ -864,7 +864,7 @@ def _get_contents_as_bytes(self) -> Optional[bytes]: if isinstance(obj, list): return b"".join(x.get_object().get_data() for x in obj) else: - return cast(bytes, cast(EncodedStreamObject, obj).get_data()) + return cast(EncodedStreamObject, obj).get_data() else: return None @@ -1057,11 +1057,11 @@ def _merge_page( rect.height, ], ), - "re", + b"re", ), ) - page2content.operations.insert(1, ([], "W")) - page2content.operations.insert(2, ([], "n")) + page2content.operations.insert(1, ([], b"W")) + page2content.operations.insert(2, ([], b"n")) if page2transformation is not None: page2content = page2transformation(page2content) page2content = PageObject._content_stream_rename( @@ -1195,11 +1195,11 @@ def _merge_page_writer( rect.height, ], ), - "re", + b"re", ), ) - page2content.operations.insert(1, ([], "W")) - page2content.operations.insert(2, ([], "n")) + 
page2content.operations.insert(1, ([], b"W")) + page2content.operations.insert(2, ([], b"n")) if page2transformation is not None: page2content = page2transformation(page2content) page2content = PageObject._content_stream_rename( diff --git a/pypdf/_reader.py b/pypdf/_reader.py index aeababa7b..7c084107c 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -51,7 +51,6 @@ from ._utils import ( StrByteType, StreamType, - b_, logger_warning, read_non_whitespace, read_previous_line, @@ -328,7 +327,7 @@ def _get_object_from_stream( assert cast(str, obj_stm["/Type"]) == "/ObjStm" # /N is the number of indirect objects in the stream assert idx < obj_stm["/N"] - stream_data = BytesIO(b_(obj_stm.get_data())) + stream_data = BytesIO(obj_stm.get_data()) for i in range(obj_stm["/N"]): # type: ignore read_non_whitespace(stream_data) stream_data.seek(-1, 1) @@ -932,7 +931,7 @@ def _read_pdf15_xref_stream( xrefstream = cast(ContentStream, read_object(stream, self)) assert cast(str, xrefstream["/Type"]) == "/XRef" self.cache_indirect_object(generation, idnum, xrefstream) - stream_data = BytesIO(b_(xrefstream.get_data())) + stream_data = BytesIO(xrefstream.get_data()) # Index pairs specify the subsections in the dictionary. If # none create one subsection that spans everything. 
idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 5fecb38e7..94d45cf6d 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -336,31 +336,6 @@ def mark_location(stream: StreamType) -> None: stream.seek(-radius, 1) -B_CACHE: Dict[str, bytes] = {} - - -def b_(s: Union[str, bytes]) -> bytes: - if isinstance(s, bytes): - return s - bc = B_CACHE - if s in bc: - return bc[s] - try: - r = s.encode("latin-1") - except UnicodeEncodeError: - r = s.encode("utf-8") - if len(s) < 2: - bc[s] = r - return r - - -def str_(b: Any) -> str: - if isinstance(b, bytes): - return b.decode("latin-1") - else: - return str(b) # will return b.__str__() if defined - - @overload def ord_(b: str) -> int: ... @@ -397,12 +372,17 @@ def deprecation(msg: str) -> None: def deprecate_with_replacement(old_name: str, new_name: str, removed_in: str) -> None: """Raise an exception that a feature will be removed, but has a replacement.""" - deprecate(f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.", 4) + deprecate( + f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.", + 4, + ) def deprecation_with_replacement(old_name: str, new_name: str, removed_in: str) -> None: """Raise an exception that a feature was already removed, but has a replacement.""" - deprecation(f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead.") + deprecation( + f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead." 
+ ) def deprecate_no_replacement(name: str, removed_in: str) -> None: diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 00b9d498c..d73c00e3d 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -62,7 +62,6 @@ StrByteType, StreamType, _get_max_pdf_version_header, - b_, deprecate_with_replacement, logger_warning, ) @@ -678,9 +677,10 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: # Hello world! # endstream # endobj - + if isinstance(data, str): + data = data.encode("latin-1") file_entry = DecodedStreamObject() - file_entry.set_data(b_(data)) + file_entry.set_data(data) file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")}) # The Filespec entry diff --git a/pypdf/filters.py b/pypdf/filters.py index 137e3603a..43730cc8e 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -43,7 +43,7 @@ from ._utils import ( WHITESPACES_AS_BYTES, - b_, + deprecate, deprecate_with_replacement, deprecation_no_replacement, logger_warning, @@ -376,20 +376,18 @@ class LZWDecode: """ Taken from: - http://www.java2s.com/Open-Source/Java-Document/PDF/PDF- - Renderer/com/sun/pdfview/decode/LZWDecode.java.htm + http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm """ class Decoder: + STOP = 257 + CLEARDICT = 256 + def __init__(self, data: bytes) -> None: - self.STOP = 257 - self.CLEARDICT = 256 self.data = data self.bytepos = 0 self.bitpos = 0 - self.dict = [""] * 4096 - for i in range(256): - self.dict[i] = chr(i) + self.dict = [struct.pack("B", i) for i in range(256)] + [b""] * (4096 - 256) self.reset_dict() def reset_dict(self) -> None: @@ -416,7 +414,7 @@ def next_code(self) -> int: self.bytepos = self.bytepos + 1 return value - def decode(self) -> str: + def decode(self) -> bytes: """ TIFF 6.0 specification explains in sufficient details the steps to implement the LZW encode() and decode() algorithms. 
@@ -429,7 +427,7 @@ def decode(self) -> str: PdfReadError: If the stop code is missing """ cW = self.CLEARDICT - baos = "" + baos = b"" while True: pW = cW cW = self.next_code() @@ -444,11 +442,11 @@ def decode(self) -> str: else: if cW < self.dictlen: baos += self.dict[cW] - p = self.dict[pW] + self.dict[cW][0] + p = self.dict[pW] + self.dict[cW][0:1] self.dict[self.dictlen] = p self.dictlen += 1 else: - p = self.dict[pW] + self.dict[pW][0] + p = self.dict[pW] + self.dict[pW][0:1] baos += p self.dict[self.dictlen] = p self.dictlen += 1 @@ -460,11 +458,11 @@ def decode(self) -> str: return baos @staticmethod - def decode( + def _decodeb( data: bytes, decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, - ) -> str: + ) -> bytes: """ Decode an LZW encoded data stream. @@ -476,9 +474,28 @@ def decode( decoded data. """ # decode_parms is unused here - return LZWDecode.Decoder(data).decode() + @staticmethod + def decode( + data: bytes, + decode_parms: Optional[DictionaryObject] = None, + **kwargs: Any, + ) -> str: # deprecated + """ + Decode an LZW encoded data stream. + + Args: + data: ``bytes`` or ``str`` text to decode. + decode_parms: a dictionary of parameter values. + + Returns: + decoded data. + """ + # decode_parms is unused here + deprecate("LZWDecode.decode will return bytes instead of str in pypdf 6.0.0") + return LZWDecode.Decoder(data).decode().decode("latin-1") + class ASCII85Decode: """Decodes string ASCII85-encoded data into a byte format.""" @@ -651,7 +668,7 @@ def decode( return tiff_header + data -def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject +def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject """ Decode the stream data based on the specified filters. 
@@ -678,7 +695,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters)) if not isinstance(decodparms, (list, tuple)): decodparms = (decodparms,) - data: bytes = b_(stream._data) + data: bytes = stream._data # If there is not data to decode we should not try to decode the data. if data: for filter_type, params in zip(filters, decodparms): @@ -691,7 +708,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject elif filter_type in (FT.RUN_LENGTH_DECODE, FTA.RL): data = RunLengthDecode.decode(data) elif filter_type in (FT.LZW_DECODE, FTA.LZW): - data = LZWDecode.decode(data, params) # type: ignore + data = LZWDecode._decodeb(data, params) elif filter_type in (FT.ASCII_85_DECODE, FTA.A85): data = ASCII85Decode.decode(data) elif filter_type == FT.DCT_DECODE: diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 2d606b418..309d389cc 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -30,18 +30,17 @@ import re from binascii import unhexlify from math import log10 +from struct import iter_unpack from typing import Any, Callable, ClassVar, Dict, Optional, Sequence, Union, cast from .._codecs import _pdfdoc_encoding_rev from .._protocols import PdfObjectProtocol, PdfWriterProtocol from .._utils import ( StreamType, - b_, deprecate_no_replacement, logger_warning, read_non_whitespace, read_until_regex, - str_, ) from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError @@ -308,6 +307,10 @@ def __getitem__(self, key: Any) -> Any: # items should be extracted from pointed Object return self._get_object_with_check()[key] # type: ignore + def __float__(self) -> str: + # in this case we are looking for the pointed data + return self.get_object().__float__() # type: ignore + def __str__(self) -> str: # in this case we are looking for the pointed data return self.get_object().__str__() @@ -369,10 +372,10 @@ def 
read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject": # PdfRe class FloatObject(float, PdfObject): def __new__( - cls, value: Union[str, Any] = "0.0", context: Optional[Any] = None + cls, value: Any = "0.0", context: Optional[Any] = None ) -> "FloatObject": try: - value = float(str_(value)) + value = float(value) return float.__new__(cls, value) except Exception as e: # If this isn't a valid decimal (happens in malformed PDFs) @@ -599,15 +602,16 @@ def write_to_stream( ) bytearr = self.get_encoded_bytes() stream.write(b"(") - for c in bytearr: - if not chr(c).isalnum() and c != b" ": + for c_ in iter_unpack("c", bytearr): + c = cast(bytes, c_[0]) + if not c.isalnum() and c != b" ": # This: # stream.write(rf"\{c:0>3o}".encode()) # gives # https://github.com/davidhalter/parso/issues/207 - stream.write(("\\%03o" % c).encode()) + stream.write(b"\\%03o" % ord(c)) else: - stream.write(b_(chr(c))) + stream.write(c) stream.write(b")") @@ -710,12 +714,13 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader def encode_pdfdocencoding(unicode_string: str) -> bytes: - retval = bytearray() - for c in unicode_string: - try: - retval += b_(chr(_pdfdoc_encoding_rev[c])) - except KeyError: - raise UnicodeEncodeError( - "pdfdocencoding", c, -1, -1, "does not exist in translation table" - ) - return bytes(retval) + try: + return bytes([_pdfdoc_encoding_rev[k] for k in unicode_string]) + except KeyError: + raise UnicodeEncodeError( + "pdfdocencoding", + unicode_string, + -1, + -1, + "does not exist in translation table", + ) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 87d688674..399836be5 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -52,7 +52,6 @@ from .._utils import ( WHITESPACES, StreamType, - b_, deprecate_no_replacement, deprecate_with_replacement, logger_warning, @@ -843,7 +842,7 @@ def _reset_node_tree_relationship(child_obj: Any) -> None: class 
StreamObject(DictionaryObject): def __init__(self) -> None: - self._data: Union[bytes, str] = b"" + self._data: bytes = b"" self.decoded_self: Optional[DecodedStreamObject] = None def _clone( @@ -877,7 +876,7 @@ def _clone( pass super()._clone(src, pdf_dest, force_duplicate, ignore_fields, visited) - def get_data(self) -> Union[bytes, str]: + def get_data(self) -> bytes: return self._data def set_data(self, data: bytes) -> None: @@ -885,7 +884,7 @@ def set_data(self, data: bytes) -> None: def hash_value_data(self) -> bytes: data = super().hash_value_data() - data += b_(self._data) + data += self._data return data def write_to_stream( @@ -955,7 +954,7 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject": retval[NameObject(SA.FILTER)] = f if params is not None: retval[NameObject(SA.DECODE_PARMS)] = params - retval._data = FlateDecode.encode(b_(self._data), level) + retval._data = FlateDecode.encode(self._data, level) return retval def decode_as_image(self) -> Any: @@ -993,7 +992,7 @@ def __init__(self) -> None: self.decoded_self: Optional[DecodedStreamObject] = None # This overrides the parent method: - def get_data(self) -> Union[bytes, str]: + def get_data(self) -> bytes: from ..filters import decode_stream_data if self.decoded_self is not None: @@ -1003,7 +1002,7 @@ def get_data(self) -> Union[bytes, str]: # create decoded object decoded = DecodedStreamObject() - decoded.set_data(b_(decode_stream_data(self))) + decoded.set_data(decode_stream_data(self)) for key, value in list(self.items()): if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS): decoded[key] = value @@ -1058,7 +1057,7 @@ def __init__( # The inner list has two elements: # Element 0: List # Element 1: str - self._operations: List[Tuple[Any, Any]] = [] + self._operations: List[Tuple[Any, bytes]] = [] # stream may be a StreamObject or an ArrayObject containing # multiple StreamObjects to be cat'd together. 
@@ -1069,14 +1068,14 @@ def __init__( if isinstance(stream, ArrayObject): data = b"" for s in stream: - data += b_(s.get_object().get_data()) + data += s.get_object().get_data() if len(data) == 0 or data[-1] != b"\n": data += b"\n" super().set_data(bytes(data)) else: stream_data = stream.get_data() assert stream_data is not None - super().set_data(b_(stream_data)) + super().set_data(stream_data) self.forced_encoding = forced_encoding def clone( @@ -1132,7 +1131,7 @@ def _clone( ignore_fields: """ src_cs = cast("ContentStream", src) - super().set_data(b_(src_cs._data)) + super().set_data(src_cs._data) self.pdf = pdf_dest self._operations = list(src_cs._operations) self.forced_encoding = src_cs.forced_encoding @@ -1249,10 +1248,10 @@ def get_data(self) -> bytes: for op in operands: op.write_to_stream(new_data) new_data.write(b" ") - new_data.write(b_(operator)) + new_data.write(operator) new_data.write(b"\n") self._data = new_data.getvalue() - return b_(self._data) + return self._data # This overrides the parent method: def set_data(self, data: bytes) -> None: @@ -1262,21 +1261,21 @@ def set_data(self, data: bytes) -> None: @property def operations(self) -> List[Tuple[Any, Any]]: if not self._operations and self._data: - self._parse_content_stream(BytesIO(b_(self._data))) + self._parse_content_stream(BytesIO(self._data)) self._data = b"" return self._operations @operations.setter - def operations(self, operations: List[Tuple[Any, Any]]) -> None: + def operations(self, operations: List[Tuple[Any, bytes]]) -> None: self._operations = operations self._data = b"" def isolate_graphics_state(self) -> None: if self._operations: - self._operations.insert(0, ([], "q")) - self._operations.append(([], "Q")) + self._operations.insert(0, ([], b"q")) + self._operations.append(([], b"Q")) elif self._data: - self._data = b"q\n" + b_(self._data) + b"\nQ\n" + self._data = b"q\n" + self._data + b"\nQ\n" # This overrides the parent method: def write_to_stream( diff --git 
a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index fdcdc3339..b5ac6632a 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -2,7 +2,7 @@ from typing import Dict, List, Tuple, Union from .._codecs import _pdfdoc_encoding -from .._utils import StreamType, b_, logger_warning, read_non_whitespace +from .._utils import StreamType, logger_warning, read_non_whitespace from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError from ._base import ByteStringObject, TextStringObject @@ -16,7 +16,7 @@ def read_hex_string_from_stream( forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, ) -> Union["TextStringObject", "ByteStringObject"]: stream.read(1) - txt = "" + arr = [] x = b"" while True: tok = read_non_whitespace(stream) @@ -26,13 +26,37 @@ def read_hex_string_from_stream( break x += tok if len(x) == 2: - txt += chr(int(x, base=16)) + arr.append(int(x, base=16)) x = b"" if len(x) == 1: x += b"0" - if len(x) == 2: - txt += chr(int(x, base=16)) - return create_string_object(b_(txt), forced_encoding) + if x != b"": + arr.append(int(x, base=16)) + return create_string_object(bytes(arr), forced_encoding) + + +__ESPACE_DICT__ = { + b"n": ord(b"\n"), + b"r": ord(b"\r"), + b"t": ord(b"\t"), + b"b": ord(b"\b"), + b"f": ord(b"\f"), + b"(": ord(b"("), + b")": ord(b")"), + b"/": ord(b"/"), + b"\\": ord(b"\\"), + b" ": ord(b" "), + b"%": ord(b"%"), + b"<": ord(b"<"), + b">": ord(b">"), + b"[": ord(b"["), + b"]": ord(b"]"), + b"#": ord(b"#"), + b"_": ord(b"_"), + b"&": ord(b"&"), + b"$": ord(b"$"), +} +__BACKSLASH_CODE__ = 92 def read_string_from_stream( @@ -54,30 +78,9 @@ def read_string_from_stream( break elif tok == b"\\": tok = stream.read(1) - escape_dict = { - b"n": b"\n", - b"r": b"\r", - b"t": b"\t", - b"b": b"\b", - b"f": b"\f", - b"c": rb"\c", - b"(": b"(", - b")": b")", - b"/": b"/", - b"\\": b"\\", - b" ": b" ", - b"%": b"%", - b"<": b"<", - b">": b">", - b"[": b"[", - b"]": b"]", - b"#": b"#", - b"_": b"_", - b"&": b"&", - 
b"$": b"$", - } try: - tok = escape_dict[tok] + txt.append(__ESPACE_DICT__[tok]) + continue except KeyError: if b"0" <= tok <= b"7": # "The number ddd may consist of one, two, or three @@ -85,6 +88,7 @@ def read_string_from_stream( # Three octal digits shall be used, with leading zeros # as needed, if the next character of the string is also # a digit." (PDF reference 7.3.4.2, p 16) + sav = stream.tell() - 1 for _ in range(2): ntok = stream.read(1) if b"0" <= ntok <= b"7": @@ -92,7 +96,13 @@ def read_string_from_stream( else: stream.seek(-1, 1) # ntok has to be analyzed break - tok = b_(chr(int(tok, base=8))) + i = int(tok, base=8) + if i > 255: + txt.append(__BACKSLASH_CODE__) + stream.seek(sav) + else: + txt.append(i) + continue elif tok in b"\n\r": # This case is hit when a backslash followed by a line # break occurs. If it's a multi-char EOL, consume the @@ -102,12 +112,13 @@ def read_string_from_stream( stream.seek(-1, 1) # Then don't add anything to the actual string, since this # line break was escaped: - tok = b"" + continue else: msg = f"Unexpected escaped string: {tok.decode('utf-8','ignore')}" logger_warning(msg, __name__) - txt.append(tok) - return create_string_object(b"".join(txt), forced_encoding) + txt.append(__BACKSLASH_CODE__) + txt.append(ord(tok)) + return create_string_object(bytes(txt), forced_encoding) def create_string_object( diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf index 0e9633ac16c138eeaa90d3cf13e9f7cd6e2c006d..a53f28f0be432c38a1fff33672a2170eeb5f553f 100644 GIT binary patch delta 1135 zcmZuwJ5B>Z3>Bn7L8-XFHtGEtkL~mzB#PXGI^7lsK}(f3N1%gya3N-+5Rx~UZB|eA z_*(w%S~Mtv1*EGs-~I8J>sv{zoZ`BgRCDn2^;DWAA#+qhOsy z9_nuY2iNV#)9h}noP_mNYj;O2#kxAbfL+j&#EQMi$qgIN%*aCBkgS9v3tOTYrFns* zi?DXRqIyy{%_l0Z{g4pthD0$;#wiU+yHv)c!>vmue00SNBRm4^N)hR?Vv~jK{H4mw ziIFg?VZq^h6P&9s+GXO#zoHdee$2JJpxz|Hl_r{ zY(@&UHcToGu}TFAn%96`U{@PYmtn8OwFX4O-WCKhW)JHKw%|G;shLP7tsbKDqy5qZ 
aX;Lhg{J|DdMd<#DhbONh0xtWZ9J{UqF&k&I6)7fzU zDDp4D(!te>XPgE={aR6UiiW`%ZfqTeiIj@xjiPk$e@q9UWn5rLy=@eWjv@^6B7z!u zVL|uoObDgBOUxb4d9AVd3tS3uCsM>HdG}UKHtl4X<1DUS`o&!~`3?a}@xUY!MBL9r zrG8xWxg)aX>X2IbX&d=ezp1ic;>(#79o%KUaWwR|p2uxbk YRyD8Bjbb46Z8kjJvC(vTJUbbE19m{nQvd(} diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 9dcfb252d..69f2931f6 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -1,13 +1,19 @@ """Test the pypdf_cmap module.""" from io import BytesIO +from pathlib import Path import pytest -from pypdf import PdfReader +from pypdf import PdfReader, PdfWriter from pypdf._cmap import build_char_map +from pypdf.generic import ArrayObject, NameObject, NullObject from . import get_data_from_url +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" + @pytest.mark.enable_socket() @pytest.mark.slow() @@ -206,3 +212,13 @@ def test_eten_b5(): """Issue #2356""" reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf"))) reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險") + + +def test_null_missing_width(): + """For coverage of 2792""" + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") + page = writer.pages[0] + ft = page["/Resources"]["/Font"]["/F1"] + ft[NameObject("/Widths")] = ArrayObject() + ft["/FontDescriptor"][NameObject("/MissingWidth")] = NullObject() + page.extract_text() diff --git a/tests/test_page.py b/tests/test_page.py index cb7b6c723..72df648e4 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1131,9 +1131,9 @@ def test_merge_page_resources_smoke_test(): # use these keys for some "operations", to validate renaming # (the operand name doesn't matter) contents1 = page1[NO("/Contents")] = ContentStream(None, None) - contents1.operations = [(ArrayObject(props1.keys()), "page1-contents")] + contents1.operations = [(ArrayObject(props1.keys()), b"page1-contents")] contents2 = page2[NO("/Contents")] = ContentStream(None, None) - contents2.operations = 
[(ArrayObject(props2.keys()), "page2-contents")] + contents2.operations = [(ArrayObject(props2.keys()), b"page2-contents")] expected_properties = { "/just1": "/just1-value", @@ -1438,3 +1438,12 @@ def test_negative_index(): src_abs = RESOURCE_ROOT / "git.pdf" reader = PdfReader(src_abs) assert reader.pages[0] == reader.pages[-1] + + +def test_get_contents_as_bytes(): + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") + co = writer.pages[0]["/Contents"][0] + expected = co.get_data() + assert writer.pages[0]._get_contents_as_bytes() == expected + writer.pages[0][NameObject("/Contents")] = writer.pages[0]["/Contents"][0] + assert writer.pages[0]._get_contents_as_bytes() == expected diff --git a/tests/test_utils.py b/tests/test_utils.py index 856bedd86..a4ddff883 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -109,22 +109,6 @@ def test_mark_location(): Path("pypdf_pdfLocation.txt").unlink() # cleanup -@pytest.mark.parametrize( - ("input_str", "expected"), - [ - ("foo", b"foo"), - ("😀", "😀".encode()), - ("‰", "‰".encode()), - ("▷", "▷".encode()), - ("世", "世".encode()), - # A multi-character string example with non-latin-1 characters: - ("😀😃", "😀😃".encode()), - ], -) -def test_b(input_str: str, expected: bytes): - assert pypdf._utils.b_(input_str) == expected - - def test_deprecate_no_replacement(): with pytest.warns(DeprecationWarning) as warn: pypdf._utils.deprecate_no_replacement("foo", removed_in="3.0.0") diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 93bc0c9e5..4407b8fd5 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -391,11 +391,11 @@ def test_merge(tmp_path, url, name): { "/Author": "Unknown", "/CreationDate": "Thursday, May 06, 1999 3:56:54 PM", - "/Creator": "C:DEBÆł8", + "/Creator": r"C:\DEB\6338", "/Keywords": "", "/Producer": "Acrobat PDFWriter 3.02 for Windows", "/Subject": "", - "/Title": "C:DEBÆł8-6R.PDF", + "/Title": r"C:\DEB\6338-6R.PDF", }, ) ], diff --git a/tests/test_writer.py 
b/tests/test_writer.py index 9dfeffdd8..84d84d0db 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1290,7 +1290,7 @@ def test_attachments(): to_add = [ ("foobar.txt", b"foobarcontent"), ("foobar2.txt", b"foobarcontent2"), - ("foobar2.txt", b"2nd_foobarcontent"), + ("foobar2.txt", "2nd_foobarcontent"), ] for name, content in to_add: writer.add_attachment(name, content) From a9758ae1736adc51cc9bdc120b11a6d451a17e74 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 21:08:21 +0200 Subject: [PATCH 11/42] MAINT: Improve test coverage (#2796) --- tests/test_writer.py | 10 ++++++++++ tests/test_xmp.py | 31 ++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 84d84d0db..eea1c6a48 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2188,6 +2188,10 @@ def test_replace_object(): reader._replace_object(reader.pages[0].indirect_reference, reader.pages[0]) pg = PageObject.create_blank_page(writer, 1000, 1000) reader._replace_object(reader.pages[0].indirect_reference, pg) + pg = PageObject.create_blank_page(None, 1000, 1000) + pg[NameObject("/Contents")] = writer.pages[0]["/Contents"] + writer._add_object(pg) + writer.add_page(pg) def test_mime_jupyter(): @@ -2300,3 +2304,9 @@ def test_matrix_entry_in_field_annots(): auto_regenerate=False, ) assert "/Matrix" in writer.pages[0]["/Annots"][5].get_object()["/AP"]["/N"] + + +def test_set_need_appearances_writer(): + """Minimal test for coverage""" + writer = PdfWriter() + writer.set_need_appearances_writer() diff --git a/tests/test_xmp.py b/tests/test_xmp.py index f864a9df9..6615b93c8 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -7,7 +7,7 @@ import pypdf.generic import pypdf.xmp -from pypdf import PdfReader +from pypdf import PdfReader, PdfWriter from pypdf.errors import PdfReadError from . 
import get_data_from_url @@ -42,6 +42,35 @@ def test_read_xmp_metadata_samples(src): } +def test_writer_xmp_metadata_samples(): + writer = PdfWriter(SAMPLE_ROOT / "020-xmp/output_with_metadata_pymupdf.pdf") + xmp = writer.xmp_metadata + assert xmp + assert xmp.dc_contributor == [] + assert xmp.dc_creator == ["John Doe"] + assert xmp.dc_source == "Martin Thoma" # attribute node + assert xmp.dc_description == {"x-default": "This is a text"} + assert xmp.dc_date == [datetime(1990, 4, 28, 0, 0)] + assert xmp.dc_title == {"x-default": "Sample PDF with XMP Metadata"} + assert xmp.custom_properties == { + "Style": "FooBarStyle", + "other": "worlds", + "⏰": "time", + } + co = pypdf.generic.ContentStream(None, None) + co.set_data( + xmp.stream.get_data().replace( + b'dc:source="Martin Thoma"', b'dc:source="Pubpub-Zz"' + ) + ) + writer.xmp_metadata = pypdf.xmp.XmpInformation(co) + b = BytesIO() + writer.write(b) + reader = PdfReader(b) + xmp2 = reader.xmp_metadata + assert xmp2.dc_source == "Pubpub-Zz" + + @pytest.mark.parametrize( ("src", "has_xmp"), [ From cf7fcfd568bb96bb2a3b978a0bd031a18e6d90b7 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 13 Aug 2024 11:09:11 +0200 Subject: [PATCH 12/42] ENH: Compress PDF files merging identical objects (#2795) Add compress_identical_objects(). Discovered in #2728. Closes #2794. Closes #2768. --- docs/user/file-size.md | 20 +- pypdf/_text_extraction/_layout_mode/_font.py | 22 +- pypdf/_writer.py | 246 ++++++++++--------- pypdf/generic/_base.py | 3 + tests/test_writer.py | 23 ++ 5 files changed, 177 insertions(+), 137 deletions(-) diff --git a/docs/user/file-size.md b/docs/user/file-size.md index 0ee72e37e..d47ddcc0e 100644 --- a/docs/user/file-size.md +++ b/docs/user/file-size.md @@ -9,23 +9,17 @@ Some PDF documents contain the same object multiple times. For example, if an image appears three times in a PDF it could be embedded three times. 
Or it can be embedded once and referenced twice. -This can be done by reading and writing the file: +When adding data to a PdfWriter, the data is copied while respecting the original format. +For example, if two pages include the same image which is duplicated in the source document, the object will be duplicated in the PdfWriter object. -```python -from pypdf import PdfReader, PdfWriter - -reader = PdfReader("big-old-file.pdf") -writer = PdfWriter() +Additionally, when you delete objects in a document, pypdf cannot easily identify whether the objects are used elsewhere or whether the user wants to keep them. When writing the PDF file, these objects will remain hidden within it (part of the file, but not displayed). -for page in reader.pages: - writer.add_page(page) +In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)` -if reader.metadata is not None: - writer.add_metadata(reader.metadata) +* `remove_identicals` enables/disables compression by merging identical objects. +* `remove_orphans` enables/disables removal of unused objects. -with open("smaller-new-file.pdf", "wb") as fp: - writer.write(fp) -``` +It is recommended to apply this process just before writing to the file/stream. It depends on the PDF how well this works, but we have seen an 86% file reduction (from 5.7 MB to 0.8 MB) within a real PDF.
diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py index 40655b1b2..1d9617d74 100644 --- a/pypdf/_text_extraction/_layout_mode/_font.py +++ b/pypdf/_text_extraction/_layout_mode/_font.py @@ -44,7 +44,7 @@ def __post_init__(self) -> None: self.font_dictionary["/DescendantFonts"] ): while isinstance(d_font, IndirectObject): - d_font = d_font.get_object() # type: ignore[assignment] + d_font = d_font.get_object() self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font ord_map = { ord(_target): _surrogate @@ -75,7 +75,11 @@ def __post_init__(self) -> None: { ord_map[_cidx]: _width for _cidx, _width in zip( - range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1), + range( + cast(int, start_idx), + cast(int, start_idx) + len(width_list), + 1, + ), width_list, ) if _cidx in ord_map @@ -83,12 +87,20 @@ def __post_init__(self) -> None: ) skip_count = 1 # check for format (2): `int int int` - elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)): - start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object() + elif isinstance(w_next_entry, (int, float)) and isinstance( + _w[idx + 2].get_object(), (int, float) + ): + start_idx, stop_idx, const_width = ( + w_entry, + w_next_entry, + _w[idx + 2].get_object(), + ) self.width_map.update( { ord_map[_cidx]: const_width - for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1) + for _cidx in range( + cast(int, start_idx), cast(int, stop_idx + 1), 1 + ) if _cidx in ord_map } ) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index d73c00e3d..a72e2a23d 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -27,20 +27,19 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
-import collections import decimal import enum import hashlib import re import uuid from io import BytesIO, FileIO, IOBase +from itertools import compress from pathlib import Path from types import TracebackType from typing import ( IO, Any, Callable, - Deque, Dict, Iterable, List, @@ -62,6 +61,7 @@ StrByteType, StreamType, _get_max_pdf_version_header, + deprecate, deprecate_with_replacement, logger_warning, ) @@ -156,12 +156,17 @@ def __init__( clone_from: Union[None, PdfReader, StrByteType, Path] = None, ) -> None: self._header = b"%PDF-1.3" - self._objects: List[PdfObject] = [] + self._objects: List[Optional[PdfObject]] = [] """The indirect objects in the PDF.""" - self._idnum_hash: Dict[bytes, IndirectObject] = {} - """Maps hash values of indirect objects to their IndirectObject instances.""" + """Maps hash values of indirect objects to the list of IndirectObjects. + This is used for compression. + """ + self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {} + """List of already translated IDs. + dict[id(pdf)][(idnum, generation)] + """ self._id_translated: Dict[int, Dict[int, int]] = {} # The root of our page tree node. 
@@ -370,10 +375,13 @@ def get_object( indirect_reference: Union[int, IndirectObject], ) -> PdfObject: if isinstance(indirect_reference, int): - return self._objects[indirect_reference - 1] - if indirect_reference.pdf != self: + obj = self._objects[indirect_reference - 1] + elif indirect_reference.pdf != self: raise ValueError("pdf must be self") - return self._objects[indirect_reference.idnum - 1] + else: + obj = self._objects[indirect_reference.idnum - 1] + assert obj is not None # clarification for mypy + return obj def _replace_object( self, @@ -392,7 +400,9 @@ def _replace_object( obj = obj.clone(self) self._objects[indirect_reference - 1] = obj obj.indirect_reference = IndirectObject(indirect_reference, gen, self) - return self._objects[indirect_reference - 1] + + assert isinstance(obj, PdfObject) # clarification for mypy + return obj def _add_page( self, @@ -1242,14 +1252,13 @@ def write_stream(self, stream: StreamType) -> None: "It may not be written to correctly.", __name__, ) + # deprecated to be removed in pypdf 6.0.0 : + # if not self._root: + # self._root = self._add_object(self._root_object) + # self._sweep_indirect_references(self._root) - if not self._root: - self._root = self._add_object(self._root_object) - - self._sweep_indirect_references(self._root) - - object_positions = self._write_pdf_structure(stream) - xref_location = self._write_xref_table(stream, object_positions) + object_positions, free_objects = self._write_pdf_structure(stream) + xref_location = self._write_xref_table(stream, object_positions, free_objects) self._write_trailer(stream, xref_location) def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: @@ -1282,8 +1291,9 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: return my_file, stream - def _write_pdf_structure(self, stream: StreamType) -> List[int]: + def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]: object_positions = [] + free_objects = [] # 
will contain list of all free entries stream.write(self.pdf_header.encode() + b"\n") stream.write(b"%\xE2\xE3\xCF\xD3\n") @@ -1296,15 +1306,26 @@ def _write_pdf_structure(self, stream: StreamType) -> List[int]: obj = self._encryption.encrypt_object(obj, idnum, 0) obj.write_to_stream(stream) stream.write(b"\nendobj\n") - return object_positions - - def _write_xref_table(self, stream: StreamType, object_positions: List[int]) -> int: + else: + object_positions.append(-1) + free_objects.append(i + 1) + free_objects.append(0) # add 0 to loop in accordance with PDF spec + return object_positions, free_objects + + def _write_xref_table( + self, stream: StreamType, object_positions: List[int], free_objects: List[int] + ) -> int: xref_location = stream.tell() stream.write(b"xref\n") stream.write(f"0 {len(self._objects) + 1}\n".encode()) - stream.write(f"{0:0>10} {65535:0>5} f \n".encode()) + stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode()) + free_idx = 1 for offset in object_positions: - stream.write(f"{offset:0>10} {0:0>5} n \n".encode()) + if offset > 0: + stream.write(f"{offset:0>10} {0:0>5} n \n".encode()) + else: + stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode()) + free_idx += 1 return xref_location def _write_trailer(self, stream: StreamType, xref_location: int) -> None: @@ -1349,6 +1370,79 @@ def add_metadata(self, infos: Dict[str, Any]) -> None: assert isinstance(self._info, DictionaryObject) self._info.update(args) + def compress_identical_objects( + self, + remove_identicals: bool = True, + remove_orphans: bool = True, + ) -> None: + """ + Parse the PDF file and merge objects that have same hash. + This will make objects common to multiple pages. + Recommended to be used just before writing output. + + Args: + remove_identicals: Remove identical objects. + remove_orphans: Remove unreferenced objects. 
+ """ + + def replace_in_obj( + obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject] + ) -> None: + if isinstance(obj, DictionaryObject): + key_val = obj.items() + elif isinstance(obj, ArrayObject): + key_val = enumerate(obj) # type: ignore + else: + return + assert isinstance(obj, (DictionaryObject, ArrayObject)) + for k, v in key_val: + if isinstance(v, IndirectObject): + orphans[v.idnum - 1] = False + if v in crossref: + obj[k] = crossref[v] + else: + """the filtering on DictionaryObject and ArrayObject only + will be performed within replace_in_obj""" + replace_in_obj(v, crossref) + + # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...]) + self._idnum_hash = {} + orphans = [True] * len(self._objects) + # look for similar objects + for idx, obj in enumerate(self._objects): + if obj is None: + continue + assert isinstance(obj.indirect_reference, IndirectObject) + h = obj.hash_value() + if remove_identicals and h in self._idnum_hash: + self._idnum_hash[h][1].append(obj.indirect_reference) + self._objects[idx] = None + else: + self._idnum_hash[h] = (obj.indirect_reference, []) + + # generate the dict converting others to 1st + cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0} + cnv_rev: Dict[IndirectObject, IndirectObject] = {} + for k, v in cnv.items(): + cnv_rev.update(zip(v, (k,) * len(v))) + + # replace reference to merged objects + for obj in self._objects: + if isinstance(obj, (DictionaryObject, ArrayObject)): + replace_in_obj(obj, cnv_rev) + + # remove orphans (if applicable) + orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore + + orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore + + try: + orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore + except AttributeError: + pass + for i in compress(range(len(self._objects)), orphans): + self._objects[i] = None + def _sweep_indirect_references( self, root: Union[ @@ -1363,7 +1457,7 @@ def 
_sweep_indirect_references( TextStringObject, NullObject, ], - ) -> None: + ) -> None: # deprecated """ Resolving any circular references to Page objects. @@ -1379,73 +1473,13 @@ def _sweep_indirect_references( Args: root: The root of the PDF object tree to sweep. """ - stack: Deque[ - Tuple[ - Any, - Optional[Any], - Any, - List[PdfObject], - ] - ] = collections.deque() - discovered = [] - parent = None - grant_parents: List[PdfObject] = [] - key_or_id = None - - # Start from root - stack.append((root, parent, key_or_id, grant_parents)) - - while len(stack): - data, parent, key_or_id, grant_parents = stack.pop() - - # Build stack for a processing depth-first - if isinstance(data, (ArrayObject, DictionaryObject)): - for key, value in data.items(): - stack.append( - ( - value, - data, - key, - grant_parents + [parent] if parent is not None else [], - ) - ) - elif isinstance(data, IndirectObject) and data.pdf != self: - data = self._resolve_indirect_object(data) - - if str(data) not in discovered: - discovered.append(str(data)) - stack.append((data.get_object(), None, None, [])) - - # Check if data has a parent and if it is a dict or - # an array update the value - if isinstance(parent, (DictionaryObject, ArrayObject)): - if isinstance(data, StreamObject): - # a dictionary value is a stream; streams must be indirect - # objects, so we need to change this value. 
- data = self._resolve_indirect_object(self._add_object(data)) - - update_hashes = [] - - # Data changed and thus the hash value changed - if parent[key_or_id] != data: - update_hashes = [parent.hash_value()] + [ - grant_parent.hash_value() for grant_parent in grant_parents - ] - parent[key_or_id] = data - - # Update old hash value to new hash value - for old_hash in update_hashes: - indirect_reference = self._idnum_hash.pop(old_hash, None) - - if indirect_reference is not None: - indirect_reference_obj = indirect_reference.get_object() - - if indirect_reference_obj is not None: - self._idnum_hash[ - indirect_reference_obj.hash_value() - ] = indirect_reference + deprecate( + "_sweep_indirect_references has been removed, please report to dev team if this warning is observed", + ) - def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject: + def _resolve_indirect_object( + self, data: IndirectObject + ) -> IndirectObject: # deprecated """ Resolves an indirect object to an indirect object in this PDF file. @@ -1470,36 +1504,10 @@ def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject: Raises: ValueError: If the input stream is closed. 
""" - if hasattr(data.pdf, "stream") and data.pdf.stream.closed: - raise ValueError(f"I/O operation on closed file: {data.pdf.stream.name}") - - if data.pdf == self: - return data - - # Get real object indirect object - real_obj = data.pdf.get_object(data) - - if real_obj is None: - logger_warning( - f"Unable to resolve [{data.__class__.__name__}: {data}], " - "returning NullObject instead", - __name__, - ) - real_obj = NullObject() - - hash_value = real_obj.hash_value() - - # Check if object is handled - if hash_value in self._idnum_hash: - return self._idnum_hash[hash_value] - - if data.pdf == self: - self._idnum_hash[hash_value] = IndirectObject(data.idnum, 0, self) - # This is new object in this pdf - else: - self._idnum_hash[hash_value] = self._add_object(real_obj) - - return self._idnum_hash[hash_value] + deprecate( + "_resolve_indirect_object has been removed, please report to dev team if this warning is observed", + ) + return IndirectObject(0, 0, self) def get_reference(self, obj: PdfObject) -> IndirectObject: idnum = self._objects.index(obj) + 1 diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 309d389cc..9899cb48c 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -239,6 +239,9 @@ def __init__(self, idnum: int, generation: int, pdf: Any) -> None: # PdfReader self.generation = generation self.pdf = pdf + def __hash__(self) -> int: + return hash((self.idnum, self.generation, id(self.pdf))) + def clone( self, pdf_dest: PdfWriterProtocol, diff --git a/tests/test_writer.py b/tests/test_writer.py index eea1c6a48..49fe58538 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2306,6 +2306,29 @@ def test_matrix_entry_in_field_annots(): assert "/Matrix" in writer.pages[0]["/Annots"][5].get_object()["/AP"]["/N"] +@pytest.mark.enable_socket() +def test_compress_identical_objects(): + """Cf #2728 and #2794""" + url = "https://github.com/user-attachments/files/16575458/tt2.pdf" + name = "iss2794.pdf" + in_bytes = 
BytesIO(get_data_from_url(url, name=name)) + writer = PdfWriter(in_bytes) + writer.compress_identical_objects(remove_orphans=False) + out1 = BytesIO() + writer.write(out1) + assert 0.5 * len(in_bytes.getvalue()) > len(out1.getvalue()) + writer.remove_page( + 1 + ) # page0 contains fields which keep reference to the deleted page + out2 = BytesIO() + writer.write(out2) + assert len(out1.getvalue()) - 100 < len(out2.getvalue()) + writer.compress_identical_objects(remove_identicals=False) + out3 = BytesIO() + writer.write(out3) + assert len(out2.getvalue()) > len(out3.getvalue()) + + def test_set_need_appearances_writer(): """Minimal test for coverage""" writer = PdfWriter() From 2eb565d914f1dea5c9024aa8fb5f2332dd36f7f2 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 13 Aug 2024 14:21:19 +0200 Subject: [PATCH 13/42] ROB: Fix extract_text() issues on damaged PDFs (#2760) Closes #2702. --- pypdf/_cmap.py | 2 +- tests/test_cmap.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index d635724d2..035850a4a 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -258,7 +258,7 @@ def prepare_cm(ft: DictionaryObject) -> bytes: cm: bytes if isinstance(tu, StreamObject): cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data() - elif isinstance(tu, str) and tu.startswith("/Identity"): + else: # if (tu is None) or cast(str, tu).startswith("/Identity"): # the full range 0000-FFFF will be processed cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange" if isinstance(cm, str): diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 69f2931f6..9ec55723f 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -6,7 +6,7 @@ from pypdf import PdfReader, PdfWriter from pypdf._cmap import build_char_map -from pypdf.generic import ArrayObject, NameObject, NullObject +from pypdf.generic import ArrayObject, IndirectObject, NameObject, NullObject from . 
import get_data_from_url @@ -214,6 +214,22 @@ def test_eten_b5(): reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險") +@pytest.mark.enable_socket() +def test_missing_entries_in_cmap(): + """ + Issue #2702: this issue is observed on damaged pdfs + use of this file in test has been discarded as too slow/long + we will create the same error from crazyones + """ + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + p = reader.pages[0] + p["/Resources"]["/Font"]["/F1"][NameObject("/ToUnicode")] = IndirectObject( + 99999999, 0, reader + ) + p.extract_text() + + def test_null_missing_width(): """For coverage of 2792""" writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") From d9a8c544e9dce3017ce6fc4acc2171bd580ccecf Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 14 Aug 2024 21:09:33 +0200 Subject: [PATCH 14/42] ENH: Report PdfReadError instead of RecursionError (#2800) Closes #2761. --- pypdf/_doc_common.py | 7 ++++++- pypdf/_reader.py | 5 ++++- tests/test_reader.py | 14 +++++++++++++- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index ffbdb7882..4f607340d 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -1121,7 +1121,12 @@ def _flatten( obj = page.get_object() if obj: # damaged file may have invalid child in /Pages - self._flatten(obj, inherit, **addt) + try: + self._flatten(obj, inherit, **addt) + except RecursionError: + raise PdfReadError( + "Maximum recursion depth reached during page flattening." 
+ ) elif t == "/Page": for attr_in, value in list(inherit.items()): # if the page has it's own value, it does not inherit the diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 7c084107c..037f4e358 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -541,7 +541,10 @@ def read_object_header(self, stream: StreamType) -> Tuple[int, int]: def cache_get_indirect_object( self, generation: int, idnum: int ) -> Optional[PdfObject]: - return self.resolved_objects.get((generation, idnum)) + try: + return self.resolved_objects.get((generation, idnum)) + except RecursionError: + raise PdfReadError("Maximum recursion depth reached.") def cache_indirect_object( self, generation: int, idnum: int, obj: Optional[PdfObject] diff --git a/tests/test_reader.py b/tests/test_reader.py index 0a2a32b81..c7dc39b30 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -115,7 +115,9 @@ def test_iss1943(): docinfo = reader.metadata docinfo.update( { - NameObject("/CreationDate"): TextStringObject("D:20230705005151Z00'00'"), + NameObject("/CreationDate"): TextStringObject( + "D:20230705005151Z00'00'" + ), NameObject("/ModDate"): TextStringObject("D:20230705005151Z00'00'"), } ) @@ -1577,3 +1579,13 @@ def test_context_manager_with_stream(): with PdfReader(pdf_stream) as reader: assert not reader.stream.closed assert not pdf_stream.closed + + +@pytest.mark.enable_socket() +@pytest.mark.timeout(10) +def test_iss2761(): + url = "https://github.com/user-attachments/files/16312198/crash-b26d05712a29b241ac6f9dc7fff57428ba2d1a04.pdf" + name = "iss2761.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False) + with pytest.raises(PdfReadError): + reader.pages[0].extract_text() From 799630daba40fe434406bd59083e8fe736178d1e Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 15 Aug 2024 14:28:51 +0200 Subject: [PATCH 15/42] BUG: Fix sheared image (#2801) Closes #2411. 
--- pypdf/_xobj_image_helpers.py | 2 +- tests/test_images.py | 11 +++++++++++ tests/test_workflows.py | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 7a3f40d95..d870b1589 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -122,7 +122,7 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: by = 0 bit = 8 - bits for y in range(size[1]): - if (bit != 0) and (bit != 8 - bits): + if bit != 8 - bits: by += 1 bit = 8 - bits for x in range(size[0]): diff --git a/tests/test_images.py b/tests/test_images.py index 5955bf47c..5fd7d0968 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -462,3 +462,14 @@ def test_extract_image_from_object(caplog): co = reader.pages[0].get_contents() co.decode_as_image() assert "does not seem to be an Image" in caplog.text + + +@pytest.mark.enable_socket() +def test_4bits_images(caplog): + url = "https://github.com/user-attachments/files/16624406/tt.pdf" + name = "iss2411.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/user-attachments/assets/53058564-9a28-4e4a-818f-a6528013d7dc" + name = "iss2411.png" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + assert image_similarity(reader.pages[0].images[1].image, img) == 1.0 diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 4407b8fd5..1125222fc 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -412,7 +412,7 @@ def test_get_metadata(url, name, expected_metadata): ("url", "name", "strict", "exception"), [ ( - "https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf", + "https://github.com/user-attachments/files/16624503/tika-938702.pdf", "tika-938702.pdf", False, None, # iss #1090 is now fixed From 454a62a98cace9887cefa843bfb5d659e813cf8b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 15 Aug 
2024 15:59:29 +0200 Subject: [PATCH 16/42] MAINT: Fix mypy type output (#2799) Closes #2798. --- docs/modules/PageObject.rst | 8 +- pypdf/_page.py | 272 +++++++++++++++++++++++------------- pypdf/_utils.py | 79 ++--------- tests/test_workflows.py | 15 +- 4 files changed, 203 insertions(+), 171 deletions(-) diff --git a/docs/modules/PageObject.rst b/docs/modules/PageObject.rst index 45e81b6ab..b4524b443 100644 --- a/docs/modules/PageObject.rst +++ b/docs/modules/PageObject.rst @@ -6,14 +6,12 @@ The PageObject Class :undoc-members: :show-inheritance: -.. autoclass:: pypdf._utils.ImageFile +.. autoclass:: pypdf._page.VirtualListImages :members: :undoc-members: :show-inheritance: - :exclude-members: IndirectObject -.. autoclass:: pypdf._utils.File +.. autoclass:: pypdf._page.ImageFile :members: + :inherited-members: File :undoc-members: - :show-inheritance: - :exclude-members: IndirectObject diff --git a/pypdf/_page.py b/pypdf/_page.py index 48cdeb149..c51aee1ab 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -28,7 +28,9 @@ # POSSIBILITY OF SUCH DAMAGE. import math +from dataclasses import dataclass from decimal import Decimal +from io import BytesIO from pathlib import Path from typing import ( Any, @@ -58,9 +60,8 @@ ) from ._utils import ( CompressedTransformationMatrix, - File, - ImageFile, TransformationMatrixType, + _human_readable_bytes, logger_warning, matrix_multiply, ) @@ -85,6 +86,14 @@ StreamObject, ) +try: + from PIL.Image import Image + + pil_not_imported = False +except ImportError: + Image = object # type: ignore + pil_not_imported = True # error will be raised only when using images + MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox' @@ -301,6 +310,160 @@ def apply_on( return list(pt1) if isinstance(pt, list) else pt1 +@dataclass +class ImageFile: + """ + Image within the PDF file. *This object is not designed to be built.* + + This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one. 
+ """ + + name: str = "" + """ + Filename as identified within the PDF file. + """ + + data: bytes = b"" + """ + Data as bytes. + """ + + image: Optional[Image] = None + """ + Data as PIL image. + """ + + indirect_reference: Optional[IndirectObject] = None + """ + Reference to the object storing the stream. + """ + + def replace(self, new_image: Image, **kwargs: Any) -> None: + """ + Replace the image with a new PIL image. + + Args: + new_image (PIL.Image.Image): The new PIL image to replace the existing image. + **kwargs: Additional keyword arguments to pass to `Image.save()`. + + Raises: + TypeError: If the image is inline or in a PdfReader. + TypeError: If the image does not belong to a PdfWriter. + TypeError: If `new_image` is not a PIL Image. + + Note: + This method replaces the existing image with a new image. + It is not allowed for inline images or images within a PdfReader. + The `kwargs` parameter allows passing additional parameters + to `Image.save()`, such as quality. + """ + if pil_not_imported: + raise ImportError( + "pillow is required to do image extraction. 
" + "It can be installed via 'pip install pypdf[image]'" + ) + + from ._reader import PdfReader + + # to prevent circular import + from .filters import _xobj_to_image + from .generic import DictionaryObject, PdfObject + + if self.indirect_reference is None: + raise TypeError("Cannot update an inline image.") + if not hasattr(self.indirect_reference.pdf, "_id_translated"): + raise TypeError("Cannot update an image not belonging to a PdfWriter.") + if not isinstance(new_image, Image): + raise TypeError("new_image shall be a PIL Image") + b = BytesIO() + new_image.save(b, "PDF", **kwargs) + reader = PdfReader(b) + assert reader.pages[0].images[0].indirect_reference is not None + self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = ( + reader.pages[0].images[0].indirect_reference.get_object() + ) + cast( + PdfObject, self.indirect_reference.get_object() + ).indirect_reference = self.indirect_reference + # change the object attributes + extension, byte_stream, img = _xobj_to_image( + cast(DictionaryObject, self.indirect_reference.get_object()) + ) + assert extension is not None + self.name = self.name[: self.name.rfind(".")] + extension + self.data = byte_stream + self.image = img + + def __str__(self) -> str: + return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})" + + def __repr__(self) -> str: + return self.__str__()[:-1] + f", hash: {hash(self.data)})" + + +class VirtualListImages(Sequence[ImageFile]): + """ + Provides access to images referenced within a page. + Only one copy will be returned if the usage is used on the same page multiple times. + See :func:`PageObject.images` for more details. 
+ """ + + def __init__( + self, + ids_function: Callable[[], List[Union[str, List[str]]]], + get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile], + ) -> None: + self.ids_function = ids_function + self.get_function = get_function + self.current = -1 + + def __len__(self) -> int: + return len(self.ids_function()) + + def keys(self) -> List[Union[str, List[str]]]: + return self.ids_function() + + def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]: + return [(x, self[x]) for x in self.ids_function()] + + @overload + def __getitem__(self, index: Union[int, str, List[str]]) -> ImageFile: + ... + + @overload + def __getitem__(self, index: slice) -> Sequence[ImageFile]: + ... + + def __getitem__( + self, index: Union[int, slice, str, List[str], Tuple[str]] + ) -> Union[ImageFile, Sequence[ImageFile]]: + lst = self.ids_function() + if isinstance(index, slice): + indices = range(*index.indices(len(self))) + lst = [lst[x] for x in indices] + cls = type(self) + return cls((lambda: lst), self.get_function) + if isinstance(index, (str, list, tuple)): + return self.get_function(index) + if not isinstance(index, int): + raise TypeError("invalid sequence indices type") + len_self = len(lst) + if index < 0: + # support negative indexes + index = len_self + index + if index < 0 or index >= len_self: + raise IndexError("sequence index out of range") + return self.get_function(lst[index]) + + def __iter__(self) -> Iterator[ImageFile]: + for i in range(len(self)): + yield self[i] + + def __str__(self) -> str: + p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())] + return f"[{', '.join(p)}]" + + class PageObject(DictionaryObject): """ PageObject represents a single page within a PDF file. @@ -391,33 +554,6 @@ def create_blank_page( return page - @property - def _old_images(self) -> List[File]: # deprecated - """ - Get a list of all images of the page. - - This requires pillow. You can install it via 'pip install pypdf[image]'. 
- - For the moment, this does NOT include inline images. They will be added - in future. - """ - images_extracted: List[File] = [] - if RES.XOBJECT not in self[PG.RESOURCES]: # type: ignore - return images_extracted - - x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore - for obj in x_object: - if x_object[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream, img = _xobj_to_image(x_object[obj]) - if extension is not None: - filename = f"{obj[1:]}{extension}" - images_extracted.append(File(name=filename, data=byte_stream)) - images_extracted[-1].image = img - images_extracted[-1].indirect_reference = x_object[ - obj - ].indirect_reference - return images_extracted - def _get_ids_image( self, obj: Optional[DictionaryObject] = None, @@ -495,7 +631,7 @@ def _get_image( return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]])) @property - def images(self) -> List[ImageFile]: + def images(self) -> VirtualListImages: """ Read-only property emulating a list of images on a page. @@ -505,20 +641,19 @@ def images(self) -> List[ImageFile]: - An integer Examples: - reader.pages[0].images[0] # return fist image - reader.pages[0].images['/I0'] # return image '/I0' - # return image '/Image1' within '/TP1' Xobject/Form: - reader.pages[0].images['/TP1','/Image1'] - for img in reader.pages[0].images: # loop within all objects + * `reader.pages[0].images[0]` # return fist image + * `reader.pages[0].images['/I0']` # return image '/I0' + * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' Xobject/Form + * `for img in reader.pages[0].images:` # loops through all objects images.keys() and images.items() can be used. 
The ImageFile has the following properties: - `.name` : name of the object - `.data` : bytes of the object - `.image` : PIL Image Object - `.indirect_reference` : object reference + * `.name` : name of the object + * `.data` : bytes of the object + * `.image` : PIL Image Object + * `.indirect_reference` : object reference and the following methods: `.replace(new_image: PIL.Image.Image, **kwargs)` : @@ -532,7 +667,7 @@ def images(self) -> List[ImageFile]: Inline images are extracted and named ~0~, ~1~, ..., with the indirect_reference set to None. """ - return _VirtualListImages(self._get_ids_image, self._get_image) # type: ignore + return VirtualListImages(self._get_ids_image, self._get_image) def _translate_value_inlineimage(self, k: str, v: PdfObject) -> PdfObject: """Translate values used in inline image""" @@ -2393,60 +2528,3 @@ def process_font(f: DictionaryObject) -> None: for a in cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]): _get_fonts_walk(cast(DictionaryObject, a), fnt, emb) return fnt, emb # return the sets for each page - - -class _VirtualListImages(Sequence[ImageFile]): - def __init__( - self, - ids_function: Callable[[], List[Union[str, List[str]]]], - get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile], - ) -> None: - self.ids_function = ids_function - self.get_function = get_function - self.current = -1 - - def __len__(self) -> int: - return len(self.ids_function()) - - def keys(self) -> List[Union[str, List[str]]]: - return self.ids_function() - - def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]: - return [(x, self[x]) for x in self.ids_function()] - - @overload - def __getitem__(self, index: Union[int, str, List[str]]) -> ImageFile: - ... - - @overload - def __getitem__(self, index: slice) -> Sequence[ImageFile]: - ... 
- - def __getitem__( - self, index: Union[int, slice, str, List[str], Tuple[str]] - ) -> Union[ImageFile, Sequence[ImageFile]]: - lst = self.ids_function() - if isinstance(index, slice): - indices = range(*index.indices(len(self))) - lst = [lst[x] for x in indices] - cls = type(self) - return cls((lambda: lst), self.get_function) - if isinstance(index, (str, list, tuple)): - return self.get_function(index) - if not isinstance(index, int): - raise TypeError("invalid sequence indices type") - len_self = len(lst) - if index < 0: - # support negative indexes - index = len_self + index - if index < 0 or index >= len_self: - raise IndexError("sequence index out of range") - return self.get_function(lst[index]) - - def __iter__(self) -> Iterator[ImageFile]: - for i in range(len(self)): - yield self[i] - - def __str__(self) -> str: - p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())] - return f"[{', '.join(p)}]" diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 94d45cf6d..e0034ccc4 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -36,7 +36,7 @@ import warnings from dataclasses import dataclass from datetime import datetime, timezone -from io import DEFAULT_BUFFER_SIZE, BytesIO +from io import DEFAULT_BUFFER_SIZE from os import SEEK_CUR from typing import ( IO, @@ -47,7 +47,6 @@ Pattern, Tuple, Union, - cast, overload, ) @@ -525,10 +524,18 @@ def getter(self, method): # type: ignore # noqa: ANN001, ANN202 class File: from .generic import IndirectObject - name: str - data: bytes - image: Optional[Any] = None # optional ; direct image access - indirect_reference: Optional[IndirectObject] = None # optional ; link to PdfObject + name: str = "" + """ + Filename as identified within the PDF file. + """ + data: bytes = b"" + """ + Data as bytes. + """ + indirect_reference: Optional[IndirectObject] = None + """ + Reference to the object storing the stream. 
+ """ def __str__(self) -> str: return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})" @@ -537,66 +544,6 @@ def __repr__(self) -> str: return self.__str__()[:-1] + f", hash: {hash(self.data)})" -@dataclass -class ImageFile(File): - from .generic import IndirectObject - - image: Optional[Any] = None # optional ; direct PIL image access - indirect_reference: Optional[IndirectObject] = None # optional ; link to PdfObject - - def replace(self, new_image: Any, **kwargs: Any) -> None: - """ - Replace the Image with a new PIL image. - - Args: - new_image (PIL.Image.Image): The new PIL image to replace the existing image. - **kwargs: Additional keyword arguments to pass to `Image.Image.save()`. - - Raises: - TypeError: If the image is inline or in a PdfReader. - TypeError: If the image does not belong to a PdfWriter. - TypeError: If `new_image` is not a PIL Image. - - Note: - This method replaces the existing image with a new image. - It is not allowed for inline images or images within a PdfReader. - The `kwargs` parameter allows passing additional parameters - to `Image.Image.save()`, such as quality. 
- """ - from PIL import Image - - from ._reader import PdfReader - - # to prevent circular import - from .filters import _xobj_to_image - from .generic import DictionaryObject, PdfObject - - if self.indirect_reference is None: - raise TypeError("Can not update an inline image") - if not hasattr(self.indirect_reference.pdf, "_id_translated"): - raise TypeError("Can not update an image not belonging to a PdfWriter") - if not isinstance(new_image, Image.Image): - raise TypeError("new_image shall be a PIL Image") - b = BytesIO() - new_image.save(b, "PDF", **kwargs) - reader = PdfReader(b) - assert reader.pages[0].images[0].indirect_reference is not None - self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = ( - reader.pages[0].images[0].indirect_reference.get_object() - ) - cast( - PdfObject, self.indirect_reference.get_object() - ).indirect_reference = self.indirect_reference - # change the object attributes - extension, byte_stream, img = _xobj_to_image( - cast(DictionaryObject, self.indirect_reference.get_object()) - ) - assert extension is not None - self.name = self.name[: self.name.rfind(".")] + extension - self.data = byte_stream - self.image = img - - @functools.total_ordering class Version: COMPONENT_PATTERN = re.compile(r"^(\d+)(.*)$") diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 1125222fc..f01269893 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -980,7 +980,7 @@ def test_replace_image(tmp_path): # extra tests for coverage with pytest.raises(TypeError) as exc: reader.pages[0].images[0].replace(img) - assert exc.value.args[0] == "Can not update an image not belonging to a PdfWriter" + assert exc.value.args[0] == "Cannot update an image not belonging to a PdfWriter." 
i = writer.pages[0].images[0] with pytest.raises(TypeError) as exc: i.replace(reader.pages[0].images[0]) # missing .image @@ -988,7 +988,16 @@ def test_replace_image(tmp_path): i.indirect_reference = None # to behave like an inline image with pytest.raises(TypeError) as exc: i.replace(reader.pages[0].images[0].image) - assert exc.value.args[0] == "Can not update an inline image" + assert exc.value.args[0] == "Cannot update an inline image." + + import pypdf + + try: + pypdf._page.pil_not_imported = True + with pytest.raises(ImportError) as exc: + i.replace(reader.pages[0].images[0].image) + finally: + pypdf._page.pil_not_imported = False @pytest.mark.enable_socket() @@ -1015,7 +1024,7 @@ def test_inline_images(): with pytest.raises(TypeError) as exc: reader.pages[0].images[0].replace(img_ref) - assert exc.value.args[0] == "Can not update an inline image" + assert exc.value.args[0] == "Cannot update an inline image." _a = {} for x, y in reader.pages[2].images[0:-2].items(): From 0c81f3cfad26ddffbfc60d0ae855118e515fad8c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:52:19 +0200 Subject: [PATCH 17/42] ENH: Accept utf strings for metadata (#2802) Closes #2754. 
--- pypdf/generic/_base.py | 25 +++++++++++++++++++++++-- pypdf/generic/_utils.py | 38 ++++++++++++++++++++++++++++---------- tests/test_generic.py | 13 +++++++++++++ tests/test_writer.py | 21 +++++++++++++++++++++ 4 files changed, 85 insertions(+), 12 deletions(-) diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 9899cb48c..f48dc66c3 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -517,23 +517,38 @@ class TextStringObject(str, PdfObject): # noqa: SLOT000 autodetect_pdfdocencoding: bool autodetect_utf16: bool utf16_bom: bytes + _original_bytes: Optional[bytes] = None def __new__(cls, value: Any) -> "TextStringObject": + org = None if isinstance(value, bytes): + org = value value = value.decode("charmap") o = str.__new__(cls, value) + o._original_bytes = org o.autodetect_utf16 = False o.autodetect_pdfdocencoding = False o.utf16_bom = b"" if value.startswith(("\xfe\xff", "\xff\xfe")): + assert org is not None # for mypy + try: + o = str.__new__(cls, org.decode("utf-16")) + except UnicodeDecodeError as exc: + logger_warning( + f"{exc!s}\ninitial string:{exc.object!r}", + __name__, + ) + o = str.__new__(cls, exc.object[: exc.start].decode("utf-16")) + o._original_bytes = org o.autodetect_utf16 = True - o.utf16_bom = value[:2].encode("charmap") + o.utf16_bom = org[:2] else: try: encode_pdfdocencoding(o) o.autodetect_pdfdocencoding = True except UnicodeEncodeError: o.autodetect_utf16 = True + o.utf16_bom = codecs.BOM_UTF16_BE return o def clone( @@ -544,6 +559,7 @@ def clone( ) -> "TextStringObject": """Clone object into pdf_dest.""" obj = TextStringObject(self) + obj._original_bytes = self._original_bytes obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding obj.autodetect_utf16 = self.autodetect_utf16 obj.utf16_bom = self.utf16_bom @@ -559,7 +575,10 @@ def original_bytes(self) -> bytes: if that occurs, this "original_bytes" property can be used to back-calculate what the original encoded bytes were. 
""" - return self.get_original_bytes() + if self._original_bytes is not None: + return self._original_bytes + else: + return self.get_original_bytes() def get_original_bytes(self) -> bytes: # We're a text string object, but the library is trying to get our raw @@ -584,6 +603,8 @@ def get_encoded_bytes(self) -> bytes: # nicer to look at in the PDF file. Sadly, we take a performance hit # here for trying... try: + if self._original_bytes is not None: + return self._original_bytes if self.autodetect_utf16: raise UnicodeEncodeError("", "forced", -1, -1, "") bytearr = encode_pdfdocencoding(self) diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index b5ac6632a..6fce6d0b2 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -148,27 +148,45 @@ def create_string_object( out += forced_encoding[x] except Exception: out += bytes((x,)).decode("charmap") - return TextStringObject(out) + obj = TextStringObject(out) + obj._original_bytes = string + return obj elif isinstance(forced_encoding, str): if forced_encoding == "bytes": return ByteStringObject(string) - return TextStringObject(string.decode(forced_encoding)) + obj = TextStringObject(string.decode(forced_encoding)) + obj._original_bytes = string + return obj else: try: if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)): retval = TextStringObject(string.decode("utf-16")) + retval._original_bytes = string retval.autodetect_utf16 = True retval.utf16_bom = string[:2] return retval - else: - # This is probably a big performance hit here, but we need - # to convert string objects into the text/unicode-aware - # version if possible... and the only way to check if that's - # possible is to try. - # Some strings are strings, some are just byte arrays. 
- retval = TextStringObject(decode_pdfdocencoding(string)) - retval.autodetect_pdfdocencoding = True + if string.startswith(b"\x00"): + retval = TextStringObject(string.decode("utf-16be")) + retval._original_bytes = string + retval.autodetect_utf16 = True + retval.utf16_bom = codecs.BOM_UTF16_BE return retval + if string[1:2] == b"\x00": + retval = TextStringObject(string.decode("utf-16le")) + retval._original_bytes = string + retval.autodetect_utf16 = True + retval.utf16_bom = codecs.BOM_UTF16_LE + return retval + + # This is probably a big performance hit here, but we need + # to convert string objects into the text/unicode-aware + # version if possible... and the only way to check if that's + # possible is to try. + # Some strings are strings, some are just byte arrays. + retval = TextStringObject(decode_pdfdocencoding(string)) + retval._original_bytes = string + retval.autodetect_pdfdocencoding = True + return retval except UnicodeDecodeError: return ByteStringObject(string) else: diff --git a/tests/test_generic.py b/tests/test_generic.py index b1079974e..6b8ae0151 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -494,6 +494,9 @@ def test_textstringobject_autodetect_utf16(): tso.autodetect_utf16 = True tso.utf16_bom = codecs.BOM_UTF16_BE assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o" + tso.utf16_bom = codecs.BOM_UTF16_LE + assert tso.get_original_bytes() == b"\xff\xfef\x00o\x00o\x00" + assert tso.get_encoded_bytes() == b"\xff\xfef\x00o\x00o\x00" def test_remove_child_not_in_tree(): @@ -1131,6 +1134,16 @@ def test_create_string_object_utf16_bom(): result.get_encoded_bytes() == b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" ) + result = TextStringObject( + b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" + ) + assert result == "PaperPort 14\x00" + assert result.autodetect_utf16 is True + assert result.utf16_bom == b"\xff\xfe" + assert ( + result.get_encoded_bytes() + 
== b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" + ) # utf16-be without bom result = TextStringObject("ÿ") diff --git a/tests/test_writer.py b/tests/test_writer.py index 49fe58538..b6a47a18c 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2333,3 +2333,24 @@ def test_set_need_appearances_writer(): """Minimal test for coverage""" writer = PdfWriter() writer.set_need_appearances_writer() + + +def test_utf16_metadata(): + """See #2754""" + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") + writer.add_metadata( + { + "/Subject": "Invoice №AI_047", + } + ) + b = BytesIO() + writer.write(b) + b.seek(0) + reader = PdfReader(b) + assert reader.metadata.subject == "Invoice №AI_047" + bb = b.getvalue() + i = bb.find(b"/Subject") + assert bb[i : i + 100] == ( + b"/Subject (\\376\\377\\000I\\000n\\000v\\000o\\000i\\000c\\000e" + b"\\000 \\041\\026\\000A\\000I\\000\\137\\0000\\0004\\0007)" + ) From d2d520b47264c4f43b79e038d9ac78a2b583f269 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 22 Aug 2024 05:37:02 +0200 Subject: [PATCH 18/42] MAINT: Remove unused code (#2805) --- pypdf/_reader.py | 21 ++++----------------- tests/test_reader.py | 2 +- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 037f4e358..1ffcd436d 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -33,6 +33,7 @@ from pathlib import Path from types import TracebackType from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -47,7 +48,6 @@ from ._doc_common import PdfDocCommon, convert_to_int from ._encryption import Encryption, PasswordType -from ._page import PageObject from ._utils import ( StrByteType, StreamType, @@ -82,6 +82,9 @@ ) from .xmp import XmpInformation +if TYPE_CHECKING: + from ._page import PageObject + class PdfReader(PdfDocCommon): """ @@ -273,22 +276,6 @@ def xmp_metadata(self) -> Optional[XmpInformation]: finally: 
self._override_encryption = False - def _get_page(self, page_number: int) -> PageObject: - """ - Retrieve a page by number from this PDF file. - - Args: - page_number: The page number to retrieve - (pages begin at zero) - - Returns: - A :class:`PageObject` instance. - """ - if self.flattened_pages is None: - self._flatten() - assert self.flattened_pages is not None, "hint for mypy" - return self.flattened_pages[page_number] - def _get_page_number_by_indirect( self, indirect_reference: Union[None, int, NullObject, IndirectObject] ) -> Optional[int]: diff --git a/tests/test_reader.py b/tests/test_reader.py index c7dc39b30..0413a9135 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -434,7 +434,7 @@ def test_get_form(src, expected, expected_get_fields, txt_file_path): def test_get_page_number(src, page_number): src = RESOURCE_ROOT / src reader = PdfReader(src) - reader._get_page(0) + reader.get_page(0) page = reader.pages[page_number] assert reader.get_page_number(page) == page_number From 9f08cd0e48114b5788e9c219b443bf75dcdbe251 Mon Sep 17 00:00:00 2001 From: Bertrand Bordage Date: Fri, 23 Aug 2024 07:43:06 +0200 Subject: [PATCH 19/42] ROB: Raise PdfReadError when missing /Root in trailer (#2808) Fixes #2806. --- pypdf/_reader.py | 5 ++++- tests/test_reader.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 1ffcd436d..1452661a5 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -190,7 +190,10 @@ def close(self) -> None: @property def root_object(self) -> DictionaryObject: """Provide access to "/Root". 
Standardized with PdfWriter.""" - return cast(DictionaryObject, self.trailer[TK.ROOT].get_object()) + root = self.trailer[TK.ROOT] + if root is None: + raise PdfReadError('Cannot find "/Root" key in trailer') + return cast(DictionaryObject, root.get_object()) @property def _info(self) -> Optional[DictionaryObject]: diff --git a/tests/test_reader.py b/tests/test_reader.py index 0413a9135..d2394f95d 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -607,9 +607,9 @@ def test_read_unknown_zero_pages(caplog): "startxref on same line as offset", ] assert normalize_warnings(caplog.text) == warnings - with pytest.raises(AttributeError) as exc: + with pytest.raises(PdfReadError) as exc: len(reader.pages) - assert exc.value.args[0] == "'NoneType' object has no attribute 'get_object'" + assert exc.value.args[0] == 'Cannot find "/Root" key in trailer' def test_read_encrypted_without_decryption(): From b7b3c8cedfc94d1b65fe2cd15741209b532e45c8 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Fri, 23 Aug 2024 08:59:51 +0200 Subject: [PATCH 20/42] MAINT: Improve wording of set_data error message (#2810) --- pypdf/generic/_data_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 399836be5..2c6e20e57 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1021,7 +1021,7 @@ def set_data(self, data: bytes) -> None: # deprecated super().set_data(FlateDecode.encode(data)) else: raise PdfReadError( - "Streams encoded with different filter from only FlateDecode is not supported" + "Streams encoded with a filter different from FlateDecode are not supported" ) From f55d33274575789c16c04ce02b75d77c727db2f7 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:50:20 +0200 Subject: [PATCH 21/42] ENH: Robustify on missing font for Tf operator 
in text_extract() (#2816) Closes #2815. --- pypdf/_page.py | 2 +- tests/test_workflows.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index c51aee1ab..17ec04477 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1882,7 +1882,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: cmap = ( unknown_char_map[2], unknown_char_map[3], - "???" + operands[0], + f"???{operands[0]}", None, ) try: diff --git a/tests/test_workflows.py b/tests/test_workflows.py index f01269893..f307271e7 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1298,3 +1298,12 @@ def test_extract_empty_page(): name = "iss2533.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name))) assert reader.pages[1].extract_text(extraction_mode="layout") == "" + + +@pytest.mark.enable_socket() +def test_iss2815(): + """Cf #2815""" + url = "https://github.com/user-attachments/files/16760725/crash-c1920c7a064649e1191d7879952ec252473fc7e6.pdf" + name = "iss2815.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name))) + assert reader.pages[0].extract_text() == "test command with wrong number of args" From 38ea8c5598db08b573f451cae456fa55adf6fbe0 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 28 Aug 2024 07:17:43 +0200 Subject: [PATCH 22/42] ENH: Add UniGB-UTF16 encodings (#2819) Closes #2812. 
--- pypdf/_cmap.py | 2 ++ tests/test_cmap.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 035850a4a..6c5996703 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -126,6 +126,8 @@ def build_char_map_from_dict( "/ETenms-B5-V": "cp950", "/UniCNS-UTF16-H": "utf-16-be", "/UniCNS-UTF16-V": "utf-16-be", + "/UniGB-UTF16-H": "gb18030", + "/UniGB-UTF16-V": "gb18030", # UCS2 in code } diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 9ec55723f..8042d306e 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -214,7 +214,6 @@ def test_eten_b5(): reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險") -@pytest.mark.enable_socket() def test_missing_entries_in_cmap(): """ Issue #2702: this issue is observed on damaged pdfs @@ -231,10 +230,21 @@ def test_missing_entries_in_cmap(): def test_null_missing_width(): - """For coverage of 2792""" + """For coverage of #2792""" writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") page = writer.pages[0] ft = page["/Resources"]["/Font"]["/F1"] ft[NameObject("/Widths")] = ArrayObject() ft["/FontDescriptor"][NameObject("/MissingWidth")] = NullObject() page.extract_text() + + +@pytest.mark.enable_socket() +def test_unigb_utf16(): + """Cf #2812""" + url = ( + "https://github.com/user-attachments/files/16767536/W020240105322424121296.pdf" + ) + name = "iss2812.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert "《中国能源展望 2060(2024 年版)》编写委员会" in reader.pages[1].extract_text() From 82eac7e316f8f785d00ed600f8ba4aba3296a4a8 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 29 Aug 2024 18:29:23 +0200 Subject: [PATCH 23/42] ROB: Robustify .set_data() (#2821) Cope with objects where the filter is ["/FlateDecode"] and/or where data has not been read yet. 
--- pypdf/generic/_data_structures.py | 6 ++++-- tests/test_generic.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 2c6e20e57..9ddd28d66 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1013,10 +1013,12 @@ def get_data(self) -> bytes: def set_data(self, data: bytes) -> None: # deprecated from ..filters import FlateDecode - if self.get(SA.FILTER, "") == FT.FLATE_DECODE: + if self.get(SA.FILTER, "") in (FT.FLATE_DECODE, [FT.FLATE_DECODE]): if not isinstance(data, bytes): raise TypeError("data must be bytes") - assert self.decoded_self is not None + if self.decoded_self is None: + self.get_data() # to create self.decoded_self + assert self.decoded_self is not None, "mypy" self.decoded_self.set_data(data) super().set_data(FlateDecode.encode(data)) else: diff --git a/tests/test_generic.py b/tests/test_generic.py index 6b8ae0151..c14e249fe 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1326,6 +1326,22 @@ def test_encodedstream_set_data(): assert cc[NameObject("/Test")] == "/MyTest" +@pytest.mark.enable_socket() +def test_set_data_2(): + """ + Modify a stream not yet loaded and + where the filter is ["/FlateDecode"] + """ + url = "https://github.com/user-attachments/files/16796095/f5471sm-2.pdf" + name = "iss2780.pdf" + writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) + writer.root_object["/AcroForm"]["/XFA"][7].set_data(b"test") + assert writer.root_object["/AcroForm"]["/XFA"][7].get_object()["/Filter"] == [ + "/FlateDecode" + ] + assert writer.root_object["/AcroForm"]["/XFA"][7].get_object().get_data() == b"test" + + @pytest.mark.enable_socket() def test_calling_indirect_objects(): """Cope with cases where attributes/items are called from indirectObject""" From e694d5571ced01b10d0e982ee97d704218b5f991 Mon Sep 17 00:00:00 2001 From: Stefan 
<96178532+stefan6419846@users.noreply.github.com> Date: Thu, 5 Sep 2024 21:32:20 +0200 Subject: [PATCH 24/42] DEV: Fix coverage uploads (#2832) * DEV: Fix coverage uploads Starting 2024-09-02, hidden files are ignored by default: https://redirect.github.com/actions/upload-artifact/issues/602 * list files * no need to list files --- .github/workflows/github-ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index d5d9bb4d4..6cf28f394 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -135,6 +135,7 @@ jobs: name: coverage-data.${{ matrix.python-version }}-${{ matrix.use-crypto-lib }} path: .coverage.* if-no-files-found: ignore + include-hidden-files: true codestyle: name: Check code style issues From b85c171a387a3ccfd52bedb7e79d22f9b98c6f9a Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Fri, 6 Sep 2024 11:06:07 +0100 Subject: [PATCH 25/42] DOC: Small changes to PaperSize notes (#2834) Plus one typo in xmp.py. --- pypdf/papersizes.py | 13 +++++++------ pypdf/xmp.py | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pypdf/papersizes.py b/pypdf/papersizes.py index 2d83e1d5d..ed09f341f 100644 --- a/pypdf/papersizes.py +++ b/pypdf/papersizes.py @@ -11,16 +11,17 @@ class Dimensions(NamedTuple): class PaperSize: """(width, height) of the paper in portrait mode in pixels at 72 ppi.""" - # Notes how to calculate it: - # 1. Get the size of the paper in mm - # 2. Convert it to inches (25.4 millimeters are equal to 1 inches) - # 3. Convert it to pixels ad 72dpi (1 inch is equal to 72 pixels) + # Notes of how to calculate it: + # 1. Get the size of the paper in millimeters + # 2. Convert it to inches (25.4 millimeters is equal to 1 inch) + # 3. 
Convert it to pixels at 72dpi (1 inch is equal to 72 pixels) # All Din-A paper sizes follow this pattern: - # 2xA(n-1) = A(n) + # 2 x A(n - 1) = A(n) # So the height of the next bigger one is the width of the smaller one - # The ratio is always approximately the ratio 1:2**0.5 + # The ratio is always approximately 1:2**0.5 # Additionally, A0 is defined to have an area of 1 m**2 + # https://en.wikipedia.org/wiki/ISO_216 # Be aware of rounding issues! A0 = Dimensions(2384, 3370) # 841mm x 1189mm A1 = Dimensions(1684, 2384) diff --git a/pypdf/xmp.py b/pypdf/xmp.py index df55c9053..0c4444fc1 100644 --- a/pypdf/xmp.py +++ b/pypdf/xmp.py @@ -306,7 +306,7 @@ def _get_text(self, element: XmlElement) -> str: """Unique identifier of the work from which this resource was derived.""" dc_subject = property(_getter_bag(DC_NAMESPACE, "subject")) - """An unordered array of descriptive phrases or keywrods that specify the + """An unordered array of descriptive phrases or keywords that specify the topic of the content of the resource.""" dc_title = property(_getter_langalt(DC_NAMESPACE, "title")) From 98d4425122e6432f32a87c2bb33f3290ff517eff Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 11 Sep 2024 09:02:10 +0200 Subject: [PATCH 26/42] ENH: Add incremental capability to PdfWriter (#2811) Closes #2780. 
--- pypdf/_doc_common.py | 66 +++- pypdf/_page.py | 40 ++- pypdf/_protocols.py | 3 + pypdf/_reader.py | 2 + pypdf/_writer.py | 323 ++++++++++++++---- pypdf/constants.py | 5 +- pypdf/generic/_base.py | 99 +++++- pypdf/generic/_data_structures.py | 33 +- ..._Vicksburg_Sample_OCR-crazyones-merged.pdf | Bin 217093 -> 217093 bytes tests/test_generic.py | 6 + tests/test_page.py | 39 ++- tests/test_reader.py | 2 +- tests/test_writer.py | 88 +++++ 13 files changed, 613 insertions(+), 93 deletions(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 4f607340d..8d07098b4 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -65,9 +65,7 @@ from .constants import FieldDictionaryAttributes as FA from .constants import PageAttributes as PG from .constants import PagesAttributes as PA -from .errors import ( - PdfReadError, -) +from .errors import PdfReadError, PyPdfError from .generic import ( ArrayObject, BooleanObject, @@ -254,6 +252,8 @@ class PdfDocCommon: _encryption: Optional[Encryption] = None + _readonly: bool = False + @property @abstractmethod def root_object(self) -> DictionaryObject: @@ -349,7 +349,7 @@ def get_num_pages(self) -> int: return self.root_object["/Pages"]["/Count"] # type: ignore else: if self.flattened_pages is None: - self._flatten() + self._flatten(self._readonly) assert self.flattened_pages is not None return len(self.flattened_pages) @@ -366,10 +366,49 @@ def get_page(self, page_number: int) -> PageObject: A :class:`PageObject` instance. """ if self.flattened_pages is None: - self._flatten() + self._flatten(self._readonly) assert self.flattened_pages is not None, "hint for mypy" return self.flattened_pages[page_number] + def _get_page_in_node( + self, + page_number: int, + ) -> Tuple[DictionaryObject, int]: + """ + Retrieve the node and position within the /Kids containing the page. + If page_number is greater than the number of pages, it returns the top node, -1. 
+ """ + top = cast(DictionaryObject, self.root_object["/Pages"]) + + def recursive_call( + node: DictionaryObject, mi: int + ) -> Tuple[Optional[PdfObject], int]: + ma = cast(int, node.get("/Count", 1)) # default 1 for /Page types + if node["/Type"] == "/Page": + if page_number == mi: + return node, -1 + # else + return None, mi + 1 + if (page_number - mi) >= ma: # not in nodes below + if node == top: + return top, -1 + # else + return None, mi + ma + for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])): + kid = cast(DictionaryObject, kid.get_object()) + n, i = recursive_call(kid, mi) + if n is not None: # page has just been found ... + if i < 0: # ... just below! + return node, idx + # else: # ... at lower levels + return n, i + mi = i + raise PyPdfError("Unexpectedly cannot find the node.") + + node, idx = recursive_call(top, 0) + assert isinstance(node, DictionaryObject), "mypy" + return node, idx + @property def named_destinations(self) -> Dict[str, Any]: """ @@ -1082,10 +1121,20 @@ def page_mode(self) -> Optional[PagemodeType]: def _flatten( self, + list_only: bool = False, pages: Union[None, DictionaryObject, PageObject] = None, inherit: Optional[Dict[str, Any]] = None, indirect_reference: Optional[IndirectObject] = None, ) -> None: + """ + Prepare the document pages to ease searching + + Args: + list_only: Will only list the pages within _flatten_pages. + pages: + inherit: + indirect_reference: Used recursively to flatten the /Pages object. + """ inheritable_page_attributes = ( NameObject(PG.RESOURCES), NameObject(PG.MEDIABOX), @@ -1122,7 +1171,7 @@ def _flatten( if obj: # damaged file may have invalid child in /Pages try: - self._flatten(obj, inherit, **addt) + self._flatten(list_only, obj, inherit, **addt) except RecursionError: raise PdfReadError( "Maximum recursion depth reached during page flattening." 
@@ -1134,7 +1183,8 @@ def _flatten( if attr_in not in pages: pages[attr_in] = value page_obj = PageObject(self, indirect_reference) - page_obj.update(pages) + if not list_only: + page_obj.update(pages) # TODO: Could flattened_pages be None at this point? self.flattened_pages.append(page_obj) # type: ignore @@ -1158,7 +1208,7 @@ def remove_page( or destinations to reference a detached page. """ if self.flattened_pages is None: - self._flatten() + self._flatten(self._readonly) assert self.flattened_pages is not None if isinstance(page, IndirectObject): p = page.get_object() diff --git a/pypdf/_page.py b/pypdf/_page.py index 17ec04477..d4ba13134 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -492,6 +492,22 @@ def __init__( self.inline_images: Optional[Dict[str, ImageFile]] = None # below Union for mypy but actually Optional[List[str]] self.indirect_reference = indirect_reference + if indirect_reference is not None: + self.update(cast(DictionaryObject, indirect_reference.get_object())) + + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Note: this function is overloaded to return the same results + as a DictionaryObject. + + Returns: + Hash considering type and value. 
+ """ + return hash( + (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items()))) + ) def hash_value_data(self) -> bytes: data = super().hash_value_data() @@ -2399,27 +2415,33 @@ def __delitem__(self, index: Union[int, slice]) -> None: raise IndexError("index out of range") ind = self[index].indirect_reference assert ind is not None - parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None) + parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get( + "/Parent", None + ) + first = True while parent is not None: parent = cast(DictionaryObject, parent.get_object()) try: - i = parent["/Kids"].index(ind) - del parent["/Kids"][i] + i = cast(ArrayObject, parent["/Kids"]).index(ind) + del cast(ArrayObject, parent["/Kids"])[i] + first = False try: assert ind is not None del ind.pdf.flattened_pages[index] # case of page in a Reader except Exception: # pragma: no cover pass if "/Count" in parent: - parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1) - if len(parent["/Kids"]) == 0: + parent[NameObject("/Count")] = NumberObject( + cast(int, parent["/Count"]) - 1 + ) + if len(cast(ArrayObject, parent["/Kids"])) == 0: # No more objects in this part of this sub tree ind = parent.indirect_reference - parent = cast(DictionaryObject, parent.get("/Parent", None)) - else: - parent = None + parent = parent.get("/Parent", None) except ValueError: # from index - raise PdfReadError(f"Page Not Found in Page Tree {ind}") + if first: + raise PdfReadError(f"Page not found in page tree: {ind}") + break def __iter__(self) -> Iterator[PageObject]: for i in range(len(self)): diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index b5fa14879..431db1a11 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -74,6 +74,9 @@ class PdfWriterProtocol(PdfCommonDocProtocol, Protocol): _objects: List[Any] _id_translated: Dict[int, Dict[int, int]] + incremental: bool + _reader: Any # PdfReader + @abstractmethod def write(self, stream: 
Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: ... # pragma: no cover diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 1452661a5..1f7f79114 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -136,6 +136,7 @@ def __init__( with open(stream, "rb") as fh: stream = BytesIO(fh.read()) self._stream_opened = True + self._startxref: int = 0 self.read(stream) self.stream = stream @@ -563,6 +564,7 @@ def read(self, stream: StreamType) -> None: self._basic_validation(stream) self._find_eof_marker(stream) startxref = self._find_startxref_pos(stream) + self._startxref = startxref # check and eventually correct the startxref only in not strict xref_issue_nr = self._get_xref_issues(stream, startxref) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index a72e2a23d..823106fdc 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -31,6 +31,7 @@ import enum import hashlib import re +import struct import uuid from io import BytesIO, FileIO, IOBase from itertools import compress @@ -148,53 +149,71 @@ class PdfWriter(PdfDocCommon): cloning a PDF file during initialization. Typically data is added from a :class:`PdfReader`. + + Args: + clone_from: identical to fileobj (for compatibility) + + incremental: If true, loads the document and set the PdfWriter in incremental mode. + + When writing incrementally, the original document is written first and new/modified + content is appended. To be used for signed document/forms to keep signature valid. """ def __init__( self, fileobj: Union[None, PdfReader, StrByteType, Path] = "", clone_from: Union[None, PdfReader, StrByteType, Path] = None, + incremental: bool = False, ) -> None: - self._header = b"%PDF-1.3" + self.incremental = incremental + """ + Returns if the PdfWriter object has been started in incremental mode. + """ + self._objects: List[Optional[PdfObject]] = [] - """The indirect objects in the PDF.""" + """ + The indirect objects in the PDF. 
+ For the incremental case, it will be filled with None + in clone_reader_document_root. + """ - """Maps hash values of indirect objects to the list of IndirectObjects. - This is used for compression. + self._original_hash: List[int] = [] + """ + List of hashes after import; used to identify changes. """ + self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {} + """ + Maps hash values of indirect objects to the list of IndirectObjects. + This is used for compression. + """ + self._id_translated: Dict[int, Dict[int, int]] = {} """List of already translated IDs. dict[id(pdf)][(idnum, generation)] """ - self._id_translated: Dict[int, Dict[int, int]] = {} - - # The root of our page tree node. - pages = DictionaryObject() - pages.update( - { - NameObject(PA.TYPE): NameObject("/Pages"), - NameObject(PA.COUNT): NumberObject(0), - NameObject(PA.KIDS): ArrayObject(), - } - ) - self._pages = self._add_object(pages) - self.flattened_pages = [] - # info object - info = DictionaryObject() - info.update({NameObject("/Producer"): create_string_object("pypdf")}) - self._info_obj: PdfObject = self._add_object(info) - - # root object - self._root_object = DictionaryObject() - self._root_object.update( - { - NameObject(PA.TYPE): NameObject(CO.CATALOG), - NameObject(CO.PAGES): self._pages, - } - ) - self._root = self._add_object(self._root_object) + self._ID: Union[ArrayObject, None] = None + self._info_obj: PdfObject + + if self.incremental: + if isinstance(fileobj, (str, Path)): + with open(fileobj, "rb") as f: + fileobj = BytesIO(f.read(-1)) + if isinstance(fileobj, BytesIO): + fileobj = PdfReader(fileobj) + else: + raise PyPdfError("Invalid type for incremental mode") + self._reader = fileobj # prev content is in _reader.stream + self._header = fileobj.pdf_header.encode() + self._readonly = True # !!!TODO: to be analysed + else: + self._header = b"%PDF-1.3" + self._info_obj = self._add_object( + DictionaryObject( + {NameObject("/Producer"): 
create_string_object("pypdf")} + ) + ) def _get_clone_from( fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO], @@ -227,14 +246,39 @@ def _get_clone_from( self.temp_fileobj = fileobj self.fileobj = "" self.with_as_usage = False + # The root of our page tree node. + pages = DictionaryObject() + pages.update( + { + NameObject(PA.TYPE): NameObject("/Pages"), + NameObject(PA.COUNT): NumberObject(0), + NameObject(PA.KIDS): ArrayObject(), + } + ) + self.flattened_pages = [] + self._encryption: Optional[Encryption] = None + self._encrypt_entry: Optional[DictionaryObject] = None + if clone_from is not None: if not isinstance(clone_from, PdfReader): clone_from = PdfReader(clone_from) self.clone_document_from_reader(clone_from) - - self._encryption: Optional[Encryption] = None - self._encrypt_entry: Optional[DictionaryObject] = None - self._ID: Union[ArrayObject, None] = None + else: + self._pages = self._add_object(pages) + # root object + self._root_object = DictionaryObject() + self._root_object.update( + { + NameObject(PA.TYPE): NameObject(CO.CATALOG), + NameObject(CO.PAGES): self._pages, + } + ) + self._add_object(self._root_object) + if isinstance(self._ID, list): + if isinstance(self._ID[0], TextStringObject): + self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes()) + if isinstance(self._ID[1], TextStringObject): + self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes()) # for commonality @property @@ -407,10 +451,12 @@ def _replace_object( def _add_page( self, page: PageObject, - action: Callable[[Any, Union[PageObject, IndirectObject]], None], + index: int, excluded_keys: Iterable[str] = (), ) -> PageObject: - assert cast(str, page[PA.TYPE]) == CO.PAGE + if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE: + raise ValueError("Invalid page object") + assert self.flattened_pages is not None, "for mypy" page_org = page excluded_keys = list(excluded_keys) excluded_keys += [PA.PARENT, "/StructParents"] @@ -424,17 +470,29 
@@ def _add_page( ] except Exception: pass - page = cast("PageObject", page_org.clone(self, False, excluded_keys)) + page = cast( + "PageObject", page_org.clone(self, False, excluded_keys).get_object() + ) if page_org.pdf is not None: other = page_org.pdf.pdf_header self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other) - page[NameObject(PA.PARENT)] = self._pages - pages = cast(DictionaryObject, self.get_object(self._pages)) - assert page.indirect_reference is not None - action(pages[PA.KIDS], page.indirect_reference) - action(self.flattened_pages, page) - page_count = cast(int, pages[PA.COUNT]) - pages[NameObject(PA.COUNT)] = NumberObject(page_count + 1) + node, idx = self._get_page_in_node(index) + page[NameObject(PA.PARENT)] = node.indirect_reference + + if idx >= 0: + cast(ArrayObject, node[PA.KIDS]).insert(idx, page.indirect_reference) + self.flattened_pages.insert(index, page) + else: + cast(ArrayObject, node[PA.KIDS]).append(page.indirect_reference) + self.flattened_pages.append(page) + cpt = 1000 + while node is not None: + node = cast(DictionaryObject, node.get_object()) + node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1) + node = node.get(PA.PARENT, None) + cpt -= 1 + if cpt < 0: + raise PyPdfError("Too many recursive calls!") return page def set_need_appearances_writer(self, state: bool = True) -> None: @@ -497,7 +555,8 @@ def add_page( Returns: The added PageObject. """ - return self._add_page(page, list.append, excluded_keys) + assert self.flattened_pages is not None, "mypy" + return self._add_page(page, len(self.flattened_pages), excluded_keys) def insert_page( self, @@ -517,7 +576,15 @@ def insert_page( Returns: The added PageObject. 
""" - return self._add_page(page, lambda kids, p: kids.insert(index, p)) + assert self.flattened_pages is not None, "mypy" + if index < 0: + index = len(self.flattened_pages) + index + if index < 0: + raise ValueError("Invalid index value") + if index >= len(self.flattened_pages): + return self.add_page(page, excluded_keys) + else: + return self._add_page(page, index, excluded_keys) def _get_page_number_by_indirect( self, indirect_reference: Union[None, int, NullObject, IndirectObject] @@ -1115,18 +1182,29 @@ def clone_reader_document_root(self, reader: PdfReader) -> None: Args: reader: PdfReader from which the document root should be copied. """ - self._objects.clear() + if self.incremental: + self._objects = [None] * cast(int, reader.trailer["/Size"]) + else: + self._objects.clear() self._root_object = reader.root_object.clone(self) - self._root = self._root_object.indirect_reference # type: ignore[assignment] self._pages = self._root_object.raw_get("/Pages") + + assert len(self._objects) <= cast(int, reader.trailer["/Size"]) # for pytest + # must be done here before rewriting + if self.incremental: + self._original_hash = [ + (obj.hash_bin() if obj is not None else 0) for obj in self._objects + ] self._flatten() assert self.flattened_pages is not None for p in self.flattened_pages: - p[NameObject("/Parent")] = self._pages - self._objects[cast(IndirectObject, p.indirect_reference).idnum - 1] = p - cast(DictionaryObject, self._pages.get_object())[ - NameObject("/Kids") - ] = ArrayObject([p.indirect_reference for p in self.flattened_pages]) + self._replace_object(cast(IndirectObject, p.indirect_reference).idnum, p) + if not self.incremental: + p[NameObject("/Parent")] = self._pages + if not self.incremental: + cast(DictionaryObject, self._pages.get_object())[ + NameObject("/Kids") + ] = ArrayObject([p.indirect_reference for p in self.flattened_pages]) def clone_document_from_reader( self, @@ -1148,13 +1226,28 @@ def clone_document_from_reader( document. 
""" self.clone_reader_document_root(reader) - self._info_obj = self._add_object(DictionaryObject()) if TK.INFO in reader.trailer: - self._info = reader._info # actually copy fields + inf = reader._info + if self.incremental: + if inf is not None: + self._info_obj = cast( + IndirectObject, inf.clone(self).indirect_reference + ) + self._original_hash[ + cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1 + ] = cast(DictionaryObject, self._info_obj.get_object()).hash_bin() + elif inf is not None: + self._info_obj = self._add_object( + DictionaryObject(cast(DictionaryObject, inf.get_object())) + ) + else: + self._info_obj = self._add_object(DictionaryObject()) + try: self._ID = cast(ArrayObject, reader._ID).clone(self) except AttributeError: pass + if callable(after_page_append): for page in cast( ArrayObject, cast(DictionaryObject, self._pages.get_object())["/Kids"] @@ -1257,9 +1350,17 @@ def write_stream(self, stream: StreamType) -> None: # self._root = self._add_object(self._root_object) # self._sweep_indirect_references(self._root) - object_positions, free_objects = self._write_pdf_structure(stream) - xref_location = self._write_xref_table(stream, object_positions, free_objects) - self._write_trailer(stream, xref_location) + if self.incremental: + self._reader.stream.seek(0) + stream.write(self._reader.stream.read(-1)) + if len(self.list_objects_in_increment()) > 0: + self._write_increment(stream) # writes objs, Xref stream and startx + else: + object_positions, free_objects = self._write_pdf_structure(stream) + xref_location = self._write_xref_table( + stream, object_positions, free_objects + ) + self._write_trailer(stream, xref_location) def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: """ @@ -1291,6 +1392,100 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: return my_file, stream + def list_objects_in_increment(self) -> List[IndirectObject]: + """ + For debugging/analysis. 
+ Provides the list of new/modified objects that will be written + in the increment. + Deleted objects will not be freed but will become orphans. + + Returns: + List of (new / modified) IndirectObjects + """ + return [ + cast(IndirectObject, self._objects[i]).indirect_reference + for i in range(len(self._objects)) + if ( + self._objects[i] is not None + and ( + i >= len(self._original_hash) + or cast(PdfObject, self._objects[i]).hash_bin() + != self._original_hash[i] + ) + ) + ] + + def _write_increment(self, stream: StreamType) -> None: + object_positions = {} + object_blocks = [] + current_start = -1 + current_stop = -2 + for i, obj in enumerate(self._objects): + if self._objects[i] is not None and ( + i >= len(self._original_hash) + or cast(PdfObject, self._objects[i]).hash_bin() + != self._original_hash[i] + ): + idnum = i + 1 + assert isinstance(obj, PdfObject) # mypy + # first write new/modified object + object_positions[idnum] = stream.tell() + stream.write(f"{idnum} 0 obj\n".encode()) + """ encryption is not operational + if self._encryption and obj != self._encrypt_entry: + obj = self._encryption.encrypt_object(obj, idnum, 0) + """ + obj.write_to_stream(stream) + stream.write(b"\nendobj\n") + + # prepare xref + if idnum != current_stop: + if current_start > 0: + object_blocks.append( + [current_start, current_stop - current_start] + ) + current_start = idnum + current_stop = idnum + 1 + assert current_start > 0, "for pytest only" + object_blocks.append([current_start, current_stop - current_start]) + # write incremented xref + xref_location = stream.tell() + xr_id = len(self._objects) + 1 + stream.write(f"{xr_id} 0 obj".encode()) + init_data = { + NameObject("/Type"): NameObject("/XRef"), + NameObject("/Size"): NumberObject(xr_id + 1), + NameObject("/Root"): self.root_object.indirect_reference, + NameObject("/Filter"): NameObject("/FlateDecode"), + NameObject("/Index"): ArrayObject( + [NumberObject(_it) for _su in object_blocks for _it in _su] + ), + 
NameObject("/W"): ArrayObject( + [NumberObject(1), NumberObject(4), NumberObject(1)] + ), + "__streamdata__": b"", + } + if self._info is not None and ( + self._info.indirect_reference.idnum - 1 # type: ignore + >= len(self._original_hash) + or cast(IndirectObject, self._info).hash_bin() # kept for future + != self._original_hash[ + self._info.indirect_reference.idnum - 1 # type: ignore + ] + ): + init_data[NameObject(TK.INFO)] = self._info.indirect_reference + init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref) + if self._ID: + init_data[NameObject(TK.ID)] = self._ID + xr = StreamObject.initialize_from_dictionary(init_data) + xr.set_data( + b"".join( + [struct.pack(b">BIB", 1, _pos, 0) for _pos in object_positions.values()] + ) + ) + xr.write_to_stream(stream) + stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof + def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]: object_positions = [] free_objects = [] # will contain list of all free entries @@ -1337,15 +1532,15 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None: of certain special objects within the body of the file. 
""" stream.write(b"trailer\n") - trailer = DictionaryObject() - trailer.update( + trailer = DictionaryObject( { NameObject(TK.SIZE): NumberObject(len(self._objects) + 1), - NameObject(TK.ROOT): self._root, - NameObject(TK.INFO): self._info_obj, + NameObject(TK.ROOT): self.root_object.indirect_reference, } ) - if self._ID: + if self._info is not None: + trailer[NameObject(TK.INFO)] = self._info.indirect_reference + if self._ID is not None: trailer[NameObject(TK.ID)] = self._ID if self._encrypt_entry: trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference diff --git a/pypdf/constants.py b/pypdf/constants.py index 745774e2a..d7a8e310f 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -33,6 +33,7 @@ class TrailerKeys: ID = "/ID" INFO = "/Info" SIZE = "/Size" + PREV = "/Prev" class CatalogAttributes: @@ -217,7 +218,9 @@ class PageAttributes: TYPE = "/Type" # name, required; must be /Page PARENT = "/Parent" # dictionary, required; a pages object - LAST_MODIFIED = "/LastModified" # date, optional; date and time of last modification + LAST_MODIFIED = ( + "/LastModified" # date, optional; date and time of last modification + ) RESOURCES = "/Resources" # dictionary, required if there are any MEDIABOX = "/MediaBox" # rectangle, required; rectangle specifying page size CROPBOX = "/CropBox" # rectangle, optional diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index f48dc66c3..d02a79810 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -53,6 +53,17 @@ class PdfObject(PdfObjectProtocol): hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1 indirect_reference: Optional["IndirectObject"] + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. 
+ """ + raise NotImplementedError( + f"{self.__class__.__name__} does not implement .hash_bin() so far" + ) + def hash_value_data(self) -> bytes: return ("%s" % self).encode() @@ -121,7 +132,15 @@ def _reference_clone( ind = self.indirect_reference except AttributeError: return clone - i = len(pdf_dest._objects) + 1 + if ( + pdf_dest.incremental + and ind is not None + and ind.pdf == pdf_dest._reader + and ind.idnum <= len(pdf_dest._objects) + ): + i = ind.idnum + else: + i = len(pdf_dest._objects) + 1 if ind is not None: if id(ind.pdf) not in pdf_dest._id_translated: pdf_dest._id_translated[id(ind.pdf)] = {} @@ -136,7 +155,11 @@ def _reference_clone( assert obj is not None return obj pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i - pdf_dest._objects.append(clone) + try: + pdf_dest._objects[i - 1] = clone + except IndexError: + pdf_dest._objects.append(clone) + i = len(pdf_dest._objects) clone.indirect_reference = IndirectObject(i, 0, pdf_dest) return clone @@ -162,6 +185,15 @@ def clone( "NullObject", self._reference_clone(NullObject(), pdf_dest, force_duplicate) ) + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. + """ + return hash((self.__class__,)) + def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: @@ -198,6 +230,15 @@ def clone( self._reference_clone(BooleanObject(self.value), pdf_dest, force_duplicate), ) + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. + """ + return hash((self.__class__, self.value)) + def __eq__(self, __o: object) -> bool: if isinstance(__o, BooleanObject): return self.value == __o.value @@ -242,6 +283,15 @@ def __init__(self, idnum: int, generation: int, pdf: Any) -> None: # PdfReader def __hash__(self) -> int: return hash((self.idnum, self.generation, id(self.pdf))) + def hash_bin(self) -> int: + """ + Used to detect modified object. 
+ + Returns: + Hash considering type and value. + """ + return hash((self.__class__, self.idnum, self.generation, id(self.pdf))) + def clone( self, pdf_dest: PdfWriterProtocol, @@ -400,6 +450,15 @@ def clone( self._reference_clone(FloatObject(self), pdf_dest, force_duplicate), ) + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. + """ + return hash((self.__class__, self.as_numeric)) + def myrepr(self) -> str: if self == 0: return "0.0" @@ -445,6 +504,15 @@ def clone( self._reference_clone(NumberObject(self), pdf_dest, force_duplicate), ) + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. + """ + return hash((self.__class__, self.as_numeric())) + def as_numeric(self) -> int: return int(repr(self).encode("utf8")) @@ -488,6 +556,15 @@ def clone( ), ) + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. + """ + return hash((self.__class__, bytes(self))) + @property def original_bytes(self) -> bytes: """For compatibility with TextStringObject.original_bytes.""" @@ -567,6 +644,15 @@ def clone( "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate) ) + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. + """ + return hash((self.__class__, self.original_bytes)) + @property def original_bytes(self) -> bytes: """ @@ -663,6 +749,15 @@ def clone( self._reference_clone(NameObject(self), pdf_dest, force_duplicate), ) + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. 
+ """ + return hash((self.__class__, self)) + def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 9ddd28d66..08bc2806d 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -131,6 +131,15 @@ def clone( arr.append(data) return arr + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. + """ + return hash((self.__class__, tuple(x.hash_bin() for x in self))) + def items(self) -> Iterable[Any]: """Emulate DictionaryObject.items for a list (index, object).""" return enumerate(self) @@ -371,6 +380,17 @@ def _clone( else v ) + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. + """ + return hash( + (self.__class__, tuple(((k, v.hash_bin()) for k, v in self.items()))) + ) + def raw_get(self, key: Any) -> Any: return dict.__getitem__(self, key) @@ -876,6 +896,16 @@ def _clone( pass super()._clone(src, pdf_dest, force_duplicate, ignore_fields, visited) + def hash_bin(self) -> int: + """ + Used to detect modified object. + + Returns: + Hash considering type and value. 
+ """ + # use of _data to prevent errors on non decoded stream such as JBIG2 + return hash((super().hash_bin(), self._data)) + def get_data(self) -> bytes: return self._data @@ -921,7 +951,8 @@ def initialize_from_dictionary( retval = DecodedStreamObject() retval._data = data["__streamdata__"] del data["__streamdata__"] - del data[SA.LENGTH] + if SA.LENGTH in data: + del data[SA.LENGTH] retval.update(data) return retval diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf index a53f28f0be432c38a1fff33672a2170eeb5f553f..8a04001ddae371fa756d1dc2f607fd42965f0f8f 100644 GIT binary patch delta 94 zcmZo&z}vcjcY^f925Cm4i2^ATXDb5fy%!jbHuo{k?gCO;J%WZn%4YBN)r>~lMKT!= s&IQuf*qG*o1L@*8CW#mz-Ls2HXaci=smb<*Ud%46jE2*n`7_G{06VfE8vp Date: Fri, 13 Sep 2024 09:36:48 +0200 Subject: [PATCH 27/42] ENH: Robustify parsing for Object streams in XRef rebuilding (#2818) Closes #2817. --- pypdf/_reader.py | 38 ++++++++++++++++++++++++++++++++++++-- tests/test_filters.py | 8 ++------ tests/test_reader.py | 42 +++++++++++++++++++++++++++++++++++------- 3 files changed, 73 insertions(+), 15 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 1f7f79114..58c160302 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -77,6 +77,7 @@ NullObject, NumberObject, PdfObject, + StreamObject, TextStringObject, read_object, ) @@ -316,8 +317,6 @@ def _get_object_from_stream( obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore # This is an xref to a stream, so its type better be a stream assert cast(str, obj_stm["/Type"]) == "/ObjStm" - # /N is the number of indirect objects in the stream - assert idx < obj_stm["/N"] stream_data = BytesIO(obj_stm.get_data()) for i in range(obj_stm["/N"]): # type: ignore read_non_whitespace(stream_data) @@ -999,6 +998,41 @@ def _rebuild_xref_table(self, stream: StreamType) -> None: if generation not in self.xref: self.xref[generation] = 
{} self.xref[generation][idnum] = m.start(1) + + logger_warning("parsing for Object Streams", __name__) + for g in self.xref: + for i in self.xref[g]: + # get_object in manual + stream.seek(self.xref[g][i], 0) + try: + _ = self.read_object_header(stream) + o = cast(StreamObject, read_object(stream, self)) + if o.get("/Type", "") != "/ObjStm": + continue + strm = BytesIO(o.get_data()) + cpt = 0 + while True: + s = read_until_whitespace(strm) + if not s.isdigit(): + break + _i = int(s) + skip_over_whitespace(strm) + strm.seek(-1, 1) + s = read_until_whitespace(strm) + if not s.isdigit(): # pragma: no cover + break # pragma: no cover + _o = int(s) + self.xref_objStm[_i] = (i, _o) + cpt += 1 + if cpt != o.get("/N"): # pragma: no cover + logger_warning( # pragma: no cover + f"found {cpt} objects within Object({i},{g})" + f" whereas {o.get('/N')} expected", + __name__, + ) + except Exception: # could be of many cause + pass + stream.seek(0, 0) for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_): stream.seek(m.start(1), 0) diff --git a/tests/test_filters.py b/tests/test_filters.py index 146ce43cb..632095888 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -5,7 +5,6 @@ from io import BytesIO from itertools import product as cartesian_product from pathlib import Path -from unittest.mock import patch import pytest from PIL import Image @@ -225,14 +224,11 @@ def test_ccitt_fax_decode(): @pytest.mark.enable_socket() -@patch("pypdf._reader.logger_warning") -def test_decompress_zlib_error(mock_logger_warning): +def test_decompress_zlib_error(caplog): reader = PdfReader(BytesIO(get_data_from_url(name="tika-952445.pdf"))) for page in reader.pages: page.extract_text() - mock_logger_warning.assert_called_with( - "incorrect startxref pointer(3)", "pypdf._reader" - ) + assert "incorrect startxref pointer(3)" in caplog.text @pytest.mark.enable_socket() diff --git a/tests/test_reader.py b/tests/test_reader.py index aeb742372..99555cd22 100644 --- 
a/tests/test_reader.py +++ b/tests/test_reader.py @@ -276,14 +276,22 @@ def test_get_images(src, expected_images): False, 0, False, - ["startxref on same line as offset", "incorrect startxref pointer(1)"], + [ + "startxref on same line as offset", + "incorrect startxref pointer(1)", + "parsing for Object Streams", + ], ), # error on startxref, but no strict => xref rebuilt,no fail ( False, True, 0, False, - ["startxref on same line as offset", "incorrect startxref pointer(1)"], + [ + "startxref on same line as offset", + "incorrect startxref pointer(1)", + "parsing for Object Streams", + ], ), ], ) @@ -344,7 +352,10 @@ def test_issue297(caplog): assert caplog.text == "" assert "Broken xref table" in exc.value.args[0] reader = PdfReader(path, strict=False) - assert normalize_warnings(caplog.text) == ["incorrect startxref pointer(1)"] + assert normalize_warnings(caplog.text) == [ + "incorrect startxref pointer(1)", + "parsing for Object Streams", + ] reader.pages[0] @@ -898,11 +909,14 @@ def test_form_topname_with_and_without_acroform(caplog): def test_extract_text_xref_issue_2(caplog): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/981/981961.pdf" - msg = "incorrect startxref pointer(2)" + msg = [ + "incorrect startxref pointer(2)", + "parsing for Object Streams", + ] reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-981961.pdf"))) for page in reader.pages: page.extract_text() - assert normalize_warnings(caplog.text) == [msg] + assert normalize_warnings(caplog.text) == msg @pytest.mark.enable_socket() @@ -910,11 +924,13 @@ def test_extract_text_xref_issue_2(caplog): def test_extract_text_xref_issue_3(caplog): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977774.pdf" - msg = "incorrect startxref pointer(3)" + msg = [ + "incorrect startxref pointer(3)", + ] reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-977774.pdf"))) for page in 
reader.pages: page.extract_text() - assert normalize_warnings(caplog.text) == [msg] + assert normalize_warnings(caplog.text) == msg @pytest.mark.enable_socket() @@ -1589,3 +1605,15 @@ def test_iss2761(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False) with pytest.raises(PdfReadError): reader.pages[0].extract_text() + + +@pytest.mark.enable_socket() +def test_iss2817(): + """Test for rebuiling Xref_ObjStm""" + url = "https://github.com/user-attachments/files/16764070/crash-7e1356f1179b4198337f282304cb611aea26a199.pdf" + name = "iss2817.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert ( + reader.pages[0]["/Annots"][0].get_object()["/Contents"] + == "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B" + ) From c4e95bd0760dc4c1e593200ba3c2525978dd7543 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Fri, 13 Sep 2024 12:30:55 +0100 Subject: [PATCH 28/42] STY: Use f-string = functionality (#2835) * STY: Use f-string = functionality * STY: Use f-string = functionality * STY: Use f-string = functionality Also switch the order of a tuple to match the order of the line above. --------- Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> --- pypdf/annotations/_non_markup_annotations.py | 2 +- pypdf/generic/_rectangle.py | 2 +- tests/test_generic.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pypdf/annotations/_non_markup_annotations.py b/pypdf/annotations/_non_markup_annotations.py index 6272cceee..af02223e7 100644 --- a/pypdf/annotations/_non_markup_annotations.py +++ b/pypdf/annotations/_non_markup_annotations.py @@ -36,7 +36,7 @@ def __init__( if is_external and is_internal: raise ValueError( "Either 'url' or 'target_page_index' have to be provided. 
" - f"url={url}, target_page_index={target_page_index}" + f"{url=}, {target_page_index=}" ) border_arr: BorderArrayType diff --git a/pypdf/generic/_rectangle.py b/pypdf/generic/_rectangle.py index 690b52172..c1f22cebc 100644 --- a/pypdf/generic/_rectangle.py +++ b/pypdf/generic/_rectangle.py @@ -26,7 +26,7 @@ def __init__( ArrayObject.__init__(self, [self._ensure_is_number(x) for x in arr]) # type: ignore def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]: - if not isinstance(value, (NumberObject, FloatObject)): + if not isinstance(value, (FloatObject, NumberObject)): value = FloatObject(value) return value diff --git a/tests/test_generic.py b/tests/test_generic.py index 2616ec6af..190bb25e6 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -981,7 +981,7 @@ def test_annotation_builder_link(pdf_file_path): ) assert exc.value.args[0] == ( "Either 'url' or 'target_page_index' have to be provided. " - "url=https://martin-thoma.com/, target_page_index=3" + "url='https://martin-thoma.com/', target_page_index=3" ) # Part 2: Too few args From 78baa8f30bf9f2acabd20c9efdc26a4b81042999 Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Sat, 14 Sep 2024 18:56:00 +1000 Subject: [PATCH 29/42] BUG: Warn when visitor* arguments are ignored (#2845) visitor* function arguments are silently ignored when extraction_mode="layout". Document this a bit better and add a warning when these arguments are ignored. Closes #2840. --- pypdf/_page.py | 14 ++++++++++++++ tests/test_text_extraction.py | 16 ++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/pypdf/_page.py b/pypdf/_page.py index d4ba13134..471256eec 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -2172,19 +2172,24 @@ def extract_text( default = (0, 90, 180, 270) note: currently only 0 (up),90 (turned left), 180 (upside down), 270 (turned right) + Silently ignored in "layout" mode. 
space_width: force default space width if not extracted from font (default: 200) + Silently ignored in "layout" mode. visitor_operand_before: function to be called before processing an operation. It has four arguments: operator, operand-arguments, current transformation matrix and text matrix. + Ignored with a warning in "layout" mode. visitor_operand_after: function to be called after processing an operation. It has four arguments: operator, operand-arguments, current transformation matrix and text matrix. + Ignored with a warning in "layout" mode. visitor_text: function to be called when extracting some text at some position. It has five arguments: text, current transformation matrix, text matrix, font-dictionary and font-size. The font-dictionary may be None in case of unknown fonts. If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". + Ignored with a warning in "layout" mode. extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality, "layout" for experimental layout mode functionality. 
NOTE: orientations, space_width, and visitor_* parameters are NOT respected @@ -2213,6 +2218,15 @@ def extract_text( if extraction_mode not in ["plain", "layout"]: raise ValueError(f"Invalid text extraction mode '{extraction_mode}'") if extraction_mode == "layout": + for visitor in ("visitor_operand_before", + "visitor_operand_after", + "visitor_text", + ): + if locals()[visitor]: + logger_warning( + f"Argument {visitor} is ignored in layout mode", + __name__, + ) return self._layout_mode_text( space_vertically=kwargs.get("layout_mode_space_vertically", True), scale_weight=kwargs.get("layout_mode_scale_weight", 1.25), diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index dcd4e6cae..2f0eaad1d 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -5,6 +5,7 @@ """ from io import BytesIO from pathlib import Path +from unittest.mock import patch import pytest @@ -173,3 +174,18 @@ def test_layout_mode_indirect_sequence_font_widths(): with pytest.raises(ParseError) as exc: reader.pages[0].extract_text(extraction_mode="layout") assert str(exc.value).startswith("Invalid font width definition") + +def dummy_visitor_text(text, ctm, tm, fd, fs): + pass + +@patch("pypdf._page.logger_warning") +def test_layout_mode_warnings(mock_logger_warning): + # Check that a warning is issued when an argument is ignored + reader = PdfReader(RESOURCE_ROOT / "hello-world.pdf") + page = reader.pages[0] + page.extract_text(extraction_mode="plain", visitor_text=dummy_visitor_text) + mock_logger_warning.assert_not_called() + page.extract_text(extraction_mode="layout", visitor_text=dummy_visitor_text) + mock_logger_warning.assert_called_with( + "Argument visitor_text is ignored in layout mode", "pypdf._page" + ) From a7905322ad0307e47ae1d1a00717a7ecb14b1786 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 14 Sep 2024 13:19:19 +0200 Subject: [PATCH 30/42] ENH: Add capability to remove /Info from 
PDF (#2820) --- docs/user/metadata.md | 24 ++++++++++++ pypdf/_writer.py | 85 ++++++++++++++++++++++++++++++++----------- tests/test_writer.py | 36 +++++++++++++++++- 3 files changed, 121 insertions(+), 24 deletions(-) diff --git a/docs/user/metadata.md b/docs/user/metadata.md index 7f0a57694..a2bbdf9f0 100644 --- a/docs/user/metadata.md +++ b/docs/user/metadata.md @@ -76,6 +76,30 @@ writer.add_metadata( } ) +# Clear all data but keep the entry in PDF +writer.metadata = {} + +# Replace all entries with new set of entries +writer.metadata = { + "/Author": "Martin", + "/Producer": "Libre Writer", +} + +# Save the new PDF to a file +with open("meta-pdf.pdf", "wb") as f: + writer.write(f) +``` + +## Removing metadata entry + +```python +from pypdf import PdfWriter + +writer = PdfWriter("example.pdf") + +# Remove Metadata (/Info entry) +writer.metadata = None + # Save the new PDF to a file with open("meta-pdf.pdf", "wb") as f: writer.write(f) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 823106fdc..edcd391e4 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -53,7 +53,7 @@ ) from ._cmap import _default_fonts_space_width, build_char_map_from_dict -from ._doc_common import PdfDocCommon +from ._doc_common import DocumentInformation, PdfDocCommon from ._encryption import EncryptAlgorithm, Encryption from ._page import PageObject from ._page_labels import nums_clear_range, nums_insert, nums_next @@ -194,7 +194,7 @@ def __init__( """ self._ID: Union[ArrayObject, None] = None - self._info_obj: PdfObject + self._info_obj: Optional[PdfObject] if self.incremental: if isinstance(fileobj, (str, Path)): @@ -309,13 +309,26 @@ def _info(self) -> Optional[DictionaryObject]: Returns: /Info Dictionary; None if the entry does not exist """ - return cast(DictionaryObject, self._info_obj.get_object()) + return ( + None + if self._info_obj is None + else cast(DictionaryObject, self._info_obj.get_object()) + ) @_info.setter - def _info(self, value: Union[IndirectObject, 
DictionaryObject]) -> None: - obj = cast(DictionaryObject, self._info_obj.get_object()) - obj.clear() - obj.update(cast(DictionaryObject, value.get_object())) + def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None: + if value is None: + try: + self._objects[self._info_obj.indirect_reference.idnum - 1] = None # type: ignore + except (KeyError, AttributeError): + pass + self._info_obj = None + else: + if self._info_obj is None: + self._info_obj = self._add_object(DictionaryObject()) + obj = cast(DictionaryObject, self._info_obj.get_object()) + obj.clear() + obj.update(cast(DictionaryObject, value.get_object())) @property def xmp_metadata(self) -> Optional[XmpInformation]: @@ -1186,6 +1199,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None: self._objects = [None] * cast(int, reader.trailer["/Size"]) else: self._objects.clear() + self._info_obj = None self._root_object = reader.root_object.clone(self) self._pages = self._root_object.raw_get("/Pages") @@ -1226,22 +1240,21 @@ def clone_document_from_reader( document. 
""" self.clone_reader_document_root(reader) - if TK.INFO in reader.trailer: - inf = reader._info - if self.incremental: - if inf is not None: - self._info_obj = cast( - IndirectObject, inf.clone(self).indirect_reference - ) - self._original_hash[ - cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1 - ] = cast(DictionaryObject, self._info_obj.get_object()).hash_bin() - elif inf is not None: - self._info_obj = self._add_object( - DictionaryObject(cast(DictionaryObject, inf.get_object())) + inf = reader._info + if self.incremental: + if inf is not None: + self._info_obj = cast( + IndirectObject, inf.clone(self).indirect_reference ) - else: - self._info_obj = self._add_object(DictionaryObject()) + assert isinstance(self._info, DictionaryObject), "for mypy" + self._original_hash[ + self._info_obj.indirect_reference.idnum - 1 + ] = self._info.hash_bin() + elif inf is not None: + self._info_obj = self._add_object( + DictionaryObject(cast(DictionaryObject, inf.get_object())) + ) + # else: _info_obj = None done in clone_reader_document_root() try: self._ID = cast(ArrayObject, reader._ID).clone(self) @@ -1547,6 +1560,34 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None: trailer.write_to_stream(stream) stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof + @property + def metadata(self) -> Optional[DocumentInformation]: + """ + Retrieve/set the PDF file's document information dictionary, if it exists. + + Args: + value: Dictionary with the entries to set. If None, remove the /Info entry from the PDF. + + Note that some PDF files use (XMP) metadata streams instead of document + information dictionaries, and these metadata streams will not be + accessed by this function. 
+ """ + return super().metadata + + @metadata.setter + def metadata( + self, + value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]], + ) -> None: + if value is None: + self._info = None + else: + if self._info is not None: + self._info.clear() + else: + self._info = DictionaryObject() + self.add_metadata(value) + def add_metadata(self, infos: Dict[str, Any]) -> None: """ Add custom metadata to the output. diff --git a/tests/test_writer.py b/tests/test_writer.py index 7b9cbf003..e06db389b 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1795,9 +1795,33 @@ def test_missing_info(): writer = PdfWriter(clone_from=reader) assert len(writer.pages) == len(reader.pages) + assert writer.metadata is None + b = BytesIO() + writer.write(b) + assert b"/Info" not in b.getvalue() + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") - writer._info = reader._info + writer.metadata = reader.metadata assert dict(writer._info) == dict(reader._info) + assert writer.metadata == reader.metadata + b = BytesIO() + writer.write(b) + assert b"/Info" in b.getvalue() + + writer.metadata = {} + writer._info = DictionaryObject() # for code coverage + b = BytesIO() + writer.write(b) + assert b"/Info" in b.getvalue() + assert writer.metadata == {} + + writer.metadata = None + writer.metadata = None # for code coverage + assert writer.metadata is None + assert PdfWriter().metadata == {"/Producer": "pypdf"} + b = BytesIO() + writer.write(b) + assert b"/Info" not in b.getvalue() @pytest.mark.enable_socket() @@ -2417,6 +2441,8 @@ def test_increment_writer(caplog): writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True) # 1 object is modified: page 0 inherits MediaBox so is changed assert len(writer.list_objects_in_increment()) == 1 + b = BytesIO() + writer.write(b) writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False) # 1 object is modified: page 0 inherits MediaBox so is changed @@ -2438,7 +2464,13 @@ def 
test_increment_writer(caplog): # clone without info writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True) + assert len(writer.list_objects_in_increment()) == 0 + assert writer.metadata is None + writer.metadata = {} + assert writer.metadata == {} assert len(writer.list_objects_in_increment()) == 1 - assert writer._info == {} + writer.metadata = None + assert len(writer.list_objects_in_increment()) == 0 + assert writer.metadata is None b = BytesIO() writer.write(b) From 1bbc301a9bd4348107a8011486355f7ba6b7ef65 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 14 Sep 2024 14:20:59 +0200 Subject: [PATCH 31/42] MAINT: Deprecate PdfMerger, AnnotationBuilder and other deprecations cleanup (#2813) --- pypdf/_merger.py | 650 +----------------- pypdf/_writer.py | 16 +- pypdf/annotations/_markup_annotations.py | 2 +- pypdf/constants.py | 42 -- pypdf/filters.py | 13 - pypdf/generic/__init__.py | 274 +------- pypdf/generic/_data_structures.py | 29 +- ..._Vicksburg_Sample_OCR-crazyones-merged.pdf | Bin 217093 -> 217152 bytes tests/test_annotations.py | 270 +++++++- tests/test_encryption.py | 2 +- tests/test_generic.py | 379 +--------- tests/test_merger.py | 262 +------ tests/test_workflows.py | 11 +- tests/test_writer.py | 8 +- 14 files changed, 345 insertions(+), 1613 deletions(-) diff --git a/pypdf/_merger.py b/pypdf/_merger.py index a52a354e3..b6a830402 100644 --- a/pypdf/_merger.py +++ b/pypdf/_merger.py @@ -25,69 +25,10 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
-from io import BytesIO, FileIO, IOBase -from pathlib import Path -from types import TracebackType -from typing import ( - Any, - Dict, - Iterable, - List, - Optional, - Tuple, - Type, - Union, - cast, -) -from ._encryption import Encryption -from ._page import PageObject -from ._reader import PdfReader from ._utils import ( - StrByteType, - deprecate_with_replacement, -) -from ._writer import PdfWriter -from .constants import GoToActionArguments, TypArguments, TypFitArguments -from .constants import PagesAttributes as PA -from .generic import ( - PAGE_FIT, - ArrayObject, - Destination, - DictionaryObject, - Fit, - FloatObject, - IndirectObject, - NameObject, - NullObject, - NumberObject, - OutlineItem, - TextStringObject, - TreeObject, + deprecation_with_replacement, ) -from .pagerange import PageRange, PageRangeSpec -from .types import LayoutType, OutlineType, PagemodeType - -ERR_CLOSED_WRITER = "close() was called and thus the writer cannot be used anymore" - - -class _MergedPage: - """Collect necessary information on each page that is being merged.""" - - def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None: - self.src = src - self.pagedata = pagedata - self.out_pagedata = None - self.id = id - - -# transfered from _utils : as this function is only required here -# and merger will be soon deprecated -def str_(b: Any) -> str: # pragma: no cover - if isinstance(b, bytes): - return b.decode("latin-1") - else: - return str(b) # will return b.__str__() if defined class PdfMerger: @@ -97,590 +38,5 @@ class PdfMerger: .. 
deprecated:: 5.0.0 """ - def __init__( - self, strict: bool = False, fileobj: Union[Path, StrByteType] = "" - ) -> None: - deprecate_with_replacement("PdfMerger", "PdfWriter", "5.0.0") - self.inputs: List[Tuple[Any, PdfReader]] = [] - self.pages: List[Any] = [] - self.output: Optional[PdfWriter] = PdfWriter() - self.outline: OutlineType = [] - self.named_dests: List[Any] = [] - self.id_count = 0 - self.fileobj = fileobj - self.strict = strict - - def __enter__(self) -> "PdfMerger": - # There is nothing to do. - deprecate_with_replacement("PdfMerger", "PdfWriter", "5.0.0") - return self - - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc: Optional[BaseException], - traceback: Optional[TracebackType], - ) -> None: - """Write to the fileobj and close the merger.""" - if self.fileobj: - self.write(self.fileobj) - self.close() - - def merge( - self, - page_number: int, - fileobj: Union[Path, StrByteType, PdfReader], - outline_item: Optional[str] = None, - pages: Optional[PageRangeSpec] = None, - import_outline: bool = True, - ) -> None: - """ - Merge the pages from the given file into the output file at the - specified page number. - - Args: - page_number: The *page number* to insert this file. File will - be inserted after the given number. - fileobj: A File Object or an object that supports the standard - read and seek methods similar to a File Object. Could also be a - string representing a path to a PDF file. - outline_item: Optionally, you may specify an outline item - (previously referred to as a 'bookmark') to be applied at the - beginning of the included file by supplying the text of the outline item. - pages: can be a :class:`PageRange` - or a ``(start, stop[, step])`` tuple - to merge only the specified range of pages from the source - document into the output document. - Can also be a list of pages to merge. 
- import_outline: You may prevent the source document's - outline (collection of outline items, previously referred to as - 'bookmarks') from being imported by specifying this as ``False``. - """ - stream, encryption_obj = self._create_stream(fileobj) - - # Create a new PdfReader instance using the stream - # (either file or BytesIO or StringIO) created above - reader = PdfReader(stream, strict=self.strict) # type: ignore[arg-type] - self.inputs.append((stream, reader)) - if encryption_obj is not None: - reader._encryption = encryption_obj - - # Find the range of pages to merge. - if pages is None: - pages = (0, len(reader.pages)) - elif isinstance(pages, PageRange): - pages = pages.indices(len(reader.pages)) - elif isinstance(pages, list): - pass - elif not isinstance(pages, tuple): - raise TypeError('"pages" must be a tuple of (start, stop[, step])') - - srcpages = [] - - outline = [] - if import_outline: - outline = reader.outline - outline = self._trim_outline(reader, outline, pages) - - if outline_item: - outline_item_typ = OutlineItem( - TextStringObject(outline_item), - NumberObject(self.id_count), - Fit.fit(), - ) - self.outline += [outline_item_typ, outline] # type: ignore - else: - self.outline += outline - - dests = reader.named_destinations - trimmed_dests = self._trim_dests(reader, dests, pages) - self.named_dests += trimmed_dests - - # Gather all the pages that are going to be merged - for i in range(*pages): - page = reader.pages[i] - - id = self.id_count - self.id_count += 1 - - mp = _MergedPage(page, reader, id) - - srcpages.append(mp) - - self._associate_dests_to_pages(srcpages) - self._associate_outline_items_to_pages(srcpages) - - # Slice to insert the pages at the specified page_number - self.pages[page_number:page_number] = srcpages - - def _create_stream( - self, fileobj: Union[Path, StrByteType, PdfReader] - ) -> Tuple[IOBase, Optional[Encryption]]: - # If the fileobj parameter is a string, assume it is a path - # and create a file object at 
that location. If it is a file, - # copy the file's contents into a BytesIO stream object; if - # it is a PdfReader, copy that reader's stream into a - # BytesIO stream. - # If fileobj is none of the above types, it is not modified - encryption_obj = None - stream: IOBase - if isinstance(fileobj, (str, Path)): - stream = FileIO(fileobj, "rb") - elif isinstance(fileobj, PdfReader): - if fileobj._encryption: - encryption_obj = fileobj._encryption - orig_tell = fileobj.stream.tell() - fileobj.stream.seek(0) - stream = BytesIO(fileobj.stream.read()) - - # reset the stream to its original location - fileobj.stream.seek(orig_tell) - elif hasattr(fileobj, "seek") and hasattr(fileobj, "read"): - fileobj.seek(0) - file_content = fileobj.read() - stream = BytesIO(file_content) - else: - raise NotImplementedError( - "PdfMerger.merge requires an object that PdfReader can parse. " - "Typically, that is a Path or a string representing a Path, " - "a file object, or an object implementing .seek and .read. " - "Passing a PdfReader directly works as well." - ) - return stream, encryption_obj - - def append( - self, - fileobj: Union[StrByteType, PdfReader, Path], - outline_item: Optional[str] = None, - pages: Union[ - None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int] - ] = None, - import_outline: bool = True, - ) -> None: - """ - Identical to the :meth:`merge()` method, but assumes you want to - concatenate all pages onto the end of the file instead of specifying a - position. - - Args: - fileobj: A File Object or an object that supports the standard - read and seek methods similar to a File Object. Could also be a - string representing a path to a PDF file. - outline_item: Optionally, you may specify an outline item - (previously referred to as a 'bookmark') to be applied at the - beginning of the included file by supplying the text of the outline item. 
- pages: can be a :class:`PageRange` - or a ``(start, stop[, step])`` tuple - to merge only the specified range of pages from the source - document into the output document. - Can also be a list of pages to append. - import_outline: You may prevent the source document's - outline (collection of outline items, previously referred to as - 'bookmarks') from being imported by specifying this as ``False``. - """ - self.merge(len(self.pages), fileobj, outline_item, pages, import_outline) - - def write(self, fileobj: Union[Path, StrByteType]) -> None: - """ - Write all data that has been merged to the given output file. - - Args: - fileobj: Output file. Can be a filename or any kind of - file-like object. - """ - if self.output is None: - raise RuntimeError(ERR_CLOSED_WRITER) - - # Add pages to the PdfWriter - # The commented out line below was replaced with the two lines below it - # to allow PdfMerger to work with PyPdf 1.13 - for page in self.pages: - self.output.add_page(page.pagedata) - pages_obj = cast(Dict[str, Any], self.output._pages.get_object()) - page.out_pagedata = self.output.get_reference( - pages_obj[PA.KIDS][-1].get_object() - ) - - # Once all pages are added, create outline items to point at those pages - self._write_dests() - self._write_outline() - - # Write the output to the file - my_file, ret_fileobj = self.output.write(fileobj) - - if my_file: - ret_fileobj.close() - - def close(self) -> None: - """Shut all file descriptors (input and output) and clear all memory usage.""" - self.pages = [] - for file_descriptor, _reader in self.inputs: - file_descriptor.close() - - self.inputs = [] - self.output = None - - def add_metadata(self, infos: Dict[str, Any]) -> None: - """ - Add custom metadata to the output. - - Args: - infos: a Python dictionary where each key is a field - and each value is your new metadata. 
- An example is ``{'/Title': 'My title'}`` - """ - if self.output is None: - raise RuntimeError(ERR_CLOSED_WRITER) - self.output.add_metadata(infos) - - def set_page_layout(self, layout: LayoutType) -> None: - """ - Set the page layout. - - Args: - layout: The page layout to be used - - .. list-table:: Valid ``layout`` arguments - :widths: 50 200 - - * - /NoLayout - - Layout explicitly not specified - * - /SinglePage - - Show one page at a time - * - /OneColumn - - Show one column at a time - * - /TwoColumnLeft - - Show pages in two columns, odd-numbered pages on the left - * - /TwoColumnRight - - Show pages in two columns, odd-numbered pages on the right - * - /TwoPageLeft - - Show two pages at a time, odd-numbered pages on the left - * - /TwoPageRight - - Show two pages at a time, odd-numbered pages on the right - """ - if self.output is None: - raise RuntimeError(ERR_CLOSED_WRITER) - self.output._set_page_layout(layout) - - def set_page_mode(self, mode: PagemodeType) -> None: - """ - Set the page mode. - - Args: - mode: The page mode to use. - - .. list-table:: Valid ``mode`` arguments - :widths: 50 200 - - * - /UseNone - - Do not show outline or thumbnails panels - * - /UseOutlines - - Show outline (aka bookmarks) panel - * - /UseThumbs - - Show page thumbnails panel - * - /FullScreen - - Fullscreen view - * - /UseOC - - Show Optional Content Group (OCG) panel - * - /UseAttachments - - Show attachments panel - """ - self.page_mode = mode - - @property - def page_mode(self) -> Optional[PagemodeType]: - """ - Set the page mode. - - Args: - mode: The page mode to use. - - .. 
list-table:: Valid ``mode`` arguments - :widths: 50 200 - - * - /UseNone - - Do not show outline or thumbnails panels - * - /UseOutlines - - Show outline (aka bookmarks) panel - * - /UseThumbs - - Show page thumbnails panel - * - /FullScreen - - Fullscreen view - * - /UseOC - - Show Optional Content Group (OCG) panel - * - /UseAttachments - - Show attachments panel - """ - if self.output is None: - raise RuntimeError(ERR_CLOSED_WRITER) - return self.output.page_mode - - @page_mode.setter - def page_mode(self, mode: PagemodeType) -> None: - if self.output is None: - raise RuntimeError(ERR_CLOSED_WRITER) - self.output.page_mode = mode - - def _trim_dests( - self, - pdf: PdfReader, - dests: Dict[str, Dict[str, Any]], - pages: Union[Tuple[int, int], Tuple[int, int, int], List[int]], - ) -> List[Dict[str, Any]]: - """ - Remove named destinations that are not a part of the specified page set. - - Args: - pdf: - dests: - pages: - """ - new_dests = [] - lst = pages if isinstance(pages, list) else list(range(*pages)) - for key, obj in dests.items(): - for j in lst: - if pdf.pages[j].get_object() == obj["/Page"].get_object(): - obj[NameObject("/Page")] = obj["/Page"].get_object() - assert str_(key) == str_(obj["/Title"]) - new_dests.append(obj) - break - return new_dests - - def _trim_outline( - self, - pdf: PdfReader, - outline: OutlineType, - pages: Union[Tuple[int, int], Tuple[int, int, int], List[int]], - ) -> OutlineType: - """ - Remove outline item entries that are not a part of the specified page set. 
- - Args: - pdf: - outline: - pages: - - Returns: - An outline type - """ - new_outline = [] - prev_header_added = True - lst = pages if isinstance(pages, list) else list(range(*pages)) - for i, outline_item in enumerate(outline): - if isinstance(outline_item, list): - sub = self._trim_outline(pdf, outline_item, lst) # type: ignore - if sub: - if not prev_header_added: - new_outline.append(outline[i - 1]) - new_outline.append(sub) # type: ignore - else: - prev_header_added = False - for j in lst: - if outline_item["/Page"] is None: - continue - if pdf.pages[j].get_object() == outline_item["/Page"].get_object(): - outline_item[NameObject("/Page")] = outline_item[ - "/Page" - ].get_object() - new_outline.append(outline_item) - prev_header_added = True - break - return new_outline - - def _write_dests(self) -> None: - if self.output is None: - raise RuntimeError(ERR_CLOSED_WRITER) - for named_dest in self.named_dests: - page_index = None - if "/Page" in named_dest: # deprecated - for page_index, page in enumerate(self.pages): # noqa: B007 - if page.id == named_dest["/Page"]: - named_dest[NameObject("/Page")] = page.out_pagedata - break - - if page_index is not None: # deprecated - self.output.add_named_destination_object(named_dest) - - def _write_outline( - self, - outline: Optional[Iterable[OutlineItem]] = None, - parent: Optional[TreeObject] = None, - ) -> None: - if self.output is None: - raise RuntimeError(ERR_CLOSED_WRITER) - if outline is None: - outline = self.outline # type: ignore - assert outline is not None, "hint for mypy" # TODO: is that true? 
- - last_added = None - for outline_item in outline: - if isinstance(outline_item, list): - self._write_outline(outline_item, last_added) - continue - - page_no = None - if "/Page" in outline_item: - for page_no, page in enumerate(self.pages): # noqa: B007 - if page.id == outline_item["/Page"]: - self._write_outline_item_on_page(outline_item, page) - break - if page_no is not None: - del outline_item["/Page"], outline_item["/Type"] - last_added = self.output.add_outline_item_dict(outline_item, parent) - - def _write_outline_item_on_page( - self, outline_item: Union[OutlineItem, Destination], page: _MergedPage - ) -> None: - oi_type = cast(str, outline_item["/Type"]) - args = [NumberObject(page.id), NameObject(oi_type)] - fit2arg_keys: Dict[str, Tuple[str, ...]] = { - TypFitArguments.FIT_H: (TypArguments.TOP,), - TypFitArguments.FIT_BH: (TypArguments.TOP,), - TypFitArguments.FIT_V: (TypArguments.LEFT,), - TypFitArguments.FIT_BV: (TypArguments.LEFT,), - TypFitArguments.XYZ: (TypArguments.LEFT, TypArguments.TOP, "/Zoom"), - TypFitArguments.FIT_R: ( - TypArguments.LEFT, - TypArguments.BOTTOM, - TypArguments.RIGHT, - TypArguments.TOP, - ), - } - for arg_key in fit2arg_keys.get(oi_type, ()): - if arg_key in outline_item and not isinstance( - outline_item[arg_key], NullObject - ): - args.append(FloatObject(outline_item[arg_key])) - else: - args.append(FloatObject(0)) - del outline_item[arg_key] - - outline_item[NameObject("/A")] = DictionaryObject( - { - NameObject(GoToActionArguments.S): NameObject("/GoTo"), - NameObject(GoToActionArguments.D): ArrayObject(args), - } - ) - - def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None: - for named_dest in self.named_dests: - page_index = None - np = named_dest["/Page"] - - if isinstance(np, NumberObject): - continue - - for page in pages: - if np.get_object() == page.pagedata.get_object(): - page_index = page.id - - if page_index is None: # deprecated - raise ValueError( - f"Unresolved named destination 
'{named_dest['/Title']}'" - ) - named_dest[NameObject("/Page")] = NumberObject(page_index) - - def _associate_outline_items_to_pages( - self, pages: List[_MergedPage], outline: Optional[Iterable[OutlineItem]] = None - ) -> None: - if outline is None: - outline = self.outline # type: ignore # TODO: self.bookmarks can be None! - assert outline is not None, "hint for mypy" - for outline_item in outline: - if isinstance(outline_item, list): - self._associate_outline_items_to_pages(pages, outline_item) - continue - - page_index = None - outline_item_page = outline_item["/Page"] - - if isinstance(outline_item_page, NumberObject): - continue - - for p in pages: - if outline_item_page.get_object() == p.pagedata.get_object(): - page_index = p.id - - if page_index is not None: - outline_item[NameObject("/Page")] = NumberObject(page_index) - - def find_outline_item( - self, - outline_item: Dict[str, Any], - root: Optional[OutlineType] = None, - ) -> Optional[List[int]]: - if root is None: - root = self.outline - - for i, oi_enum in enumerate(root): - if isinstance(oi_enum, list): - # oi_enum is still an inner node - # (OutlineType, if recursive types were supported by mypy) - res = self.find_outline_item(outline_item, oi_enum) # type: ignore - if res: # deprecated - return [i] + res - elif ( - oi_enum == outline_item - or cast(Dict[Any, Any], oi_enum["/Title"]) == outline_item - ): - # we found a leaf node - return [i] - - return None - - def add_outline_item( - self, - title: str, - page_number: int, - parent: Union[None, TreeObject, IndirectObject] = None, - color: Optional[Tuple[float, float, float]] = None, - bold: bool = False, - italic: bool = False, - fit: Fit = PAGE_FIT, - ) -> IndirectObject: - """ - Add an outline item (commonly referred to as a "Bookmark") to this PDF file. - - Args: - title: Title to use for this outline item. - page_number: Page number this outline item will point to. 
- parent: A reference to a parent outline item to create nested - outline items. - color: Color of the outline item's font as a red, green, blue tuple - from 0.0 to 1.0 - bold: Outline item font is bold - italic: Outline item font is italic - fit: The fit of the destination page. - """ - writer = self.output - if writer is None: - raise RuntimeError(ERR_CLOSED_WRITER) - return writer.add_outline_item( - title, - page_number, - parent, - None, - color, - bold, - italic, - fit, - ) - - def add_named_destination( - self, - title: str, - page_number: int, - ) -> None: - """ - Add a destination to the output. - - Args: - title: Title to use - page_number: Page number this destination points at. - """ - dest = Destination( - TextStringObject(title), - NumberObject(page_number), - Fit.fit_horizontally(top=826), - ) - self.named_dests.append(dest) + def __init__(self) -> None: + deprecation_with_replacement("PdfMerger", "PdfWriter", "5.0.0") diff --git a/pypdf/_writer.py b/pypdf/_writer.py index edcd391e4..1e6cb9e26 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -63,7 +63,7 @@ StreamType, _get_max_pdf_version_header, deprecate, - deprecate_with_replacement, + deprecation_with_replacement, logger_warning, ) from .constants import AnnotationDictionaryAttributes as AA @@ -1847,6 +1847,7 @@ def add_outline_item_dict( outline_item_object = TreeObject() outline_item_object.update(outline_item) + """code currently unreachable if "/A" in outline_item: action = DictionaryObject() a_dict = cast(DictionaryObject, outline_item["/A"]) @@ -1854,7 +1855,7 @@ def add_outline_item_dict( action[NameObject(str(k))] = v action_ref = self._add_object(action) outline_item_object[NameObject("/A")] = action_ref - + """ return self.add_outline_item_destination( outline_item_object, parent, before, is_open ) @@ -2541,7 +2542,7 @@ def _create_stream( stream = BytesIO(filecontent) else: raise NotImplementedError( - "PdfMerger.merge requires an object that PdfReader can parse. 
" + "Merging requires an object that PdfReader can parse. " "Typically, that is a Path or a string representing a Path, " "a file object, or an object implementing .seek and .read. " "Passing a PdfReader directly works as well." @@ -2894,14 +2895,12 @@ def add_filtered_articles( def _get_cloned_page( self, - page: Union[None, int, IndirectObject, PageObject, NullObject], + page: Union[None, IndirectObject, PageObject, NullObject], pages: Dict[int, PageObject], reader: PdfReader, ) -> Optional[IndirectObject]: if isinstance(page, NullObject): return None - if isinstance(page, int): - _i = reader.pages[page].indirect_reference elif isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page": _i = page.indirect_reference elif isinstance(page, IndirectObject): @@ -3084,13 +3083,12 @@ def find_bookmark( self, outline_item: Dict[str, Any], root: Optional[OutlineType] = None, - ) -> Optional[List[int]]: # deprecated + ) -> None: # deprecated """ .. deprecated:: 2.9.0 Use :meth:`find_outline_item` instead. 
""" - deprecate_with_replacement("find_bookmark", "find_outline_item", "5.0.0") - return self.find_outline_item(outline_item, root) + deprecation_with_replacement("find_bookmark", "find_outline_item", "5.0.0") def reset_translation( self, reader: Union[None, PdfReader, IndirectObject] = None diff --git a/pypdf/annotations/_markup_annotations.py b/pypdf/annotations/_markup_annotations.py index 98a222483..580b8bf58 100644 --- a/pypdf/annotations/_markup_annotations.py +++ b/pypdf/annotations/_markup_annotations.py @@ -50,7 +50,7 @@ class MarkupAnnotation(AnnotationDictionary, ABC): def __init__(self, *, title_bar: Optional[str] = None): if title_bar is not None: - self[NameObject("T")] = TextStringObject(title_bar) + self[NameObject("/T")] = TextStringObject(title_bar) class Text(MarkupAnnotation): diff --git a/pypdf/constants.py b/pypdf/constants.py index d7a8e310f..89fb55359 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -14,8 +14,6 @@ from enum import IntFlag, auto from typing import Dict, Tuple -from ._utils import classproperty, deprecate_with_replacement - class Core: """Keywords that don't quite belong anywhere else.""" @@ -162,46 +160,6 @@ class Ressources: # deprecated .. 
deprecated:: 5.0.0 """ - @classproperty - def EXT_G_STATE(cls) -> str: # noqa: N805 - deprecate_with_replacement("Ressources", "Resources", "5.0.0") - return "/ExtGState" # dictionary, optional - - @classproperty - def COLOR_SPACE(cls) -> str: # noqa: N805 - deprecate_with_replacement("Ressources", "Resources", "5.0.0") - return "/ColorSpace" # dictionary, optional - - @classproperty - def PATTERN(cls) -> str: # noqa: N805 - deprecate_with_replacement("Ressources", "Resources", "5.0.0") - return "/Pattern" # dictionary, optional - - @classproperty - def SHADING(cls) -> str: # noqa: N805 - deprecate_with_replacement("Ressources", "Resources", "5.0.0") - return "/Shading" # dictionary, optional - - @classproperty - def XOBJECT(cls) -> str: # noqa: N805 - deprecate_with_replacement("Ressources", "Resources", "5.0.0") - return "/XObject" # dictionary, optional - - @classproperty - def FONT(cls) -> str: # noqa: N805 - deprecate_with_replacement("Ressources", "Resources", "5.0.0") - return "/Font" # dictionary, optional - - @classproperty - def PROC_SET(cls) -> str: # noqa: N805 - deprecate_with_replacement("Ressources", "Resources", "5.0.0") - return "/ProcSet" # array, optional - - @classproperty - def PROPERTIES(cls) -> str: # noqa: N805 - deprecate_with_replacement("Ressources", "Resources", "5.0.0") - return "/Properties" # dictionary, optional - class PagesAttributes: """§7.7.3.2 of the 1.7 and 2.0 reference.""" diff --git a/pypdf/filters.py b/pypdf/filters.py index 43730cc8e..7589c8051 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -44,7 +44,6 @@ from ._utils import ( WHITESPACES_AS_BYTES, deprecate, - deprecate_with_replacement, deprecation_no_replacement, logger_warning, ord_, @@ -118,9 +117,6 @@ def decode( Raises: PdfReadError: """ - if "decodeParms" in kwargs: # deprecated - deprecate_with_replacement("decodeParms", "parameters", "4.0.0") - decode_parms = kwargs["decodeParms"] if isinstance(decode_parms, ArrayObject): raise 
DeprecationError("decode_parms as ArrayObject is depreciated") @@ -611,9 +607,6 @@ def decode( **kwargs: Any, ) -> bytes: # decode_parms is unused here - if "decodeParms" in kwargs: # deprecated - deprecate_with_replacement("decodeParms", "parameters", "4.0.0") - decode_parms = kwargs["decodeParms"] if isinstance(decode_parms, ArrayObject): # deprecated deprecation_no_replacement( "decode_parms being an ArrayObject", removed_in="3.15.5" @@ -729,12 +722,6 @@ def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject return data -def decodeStreamData(stream: Any) -> Union[str, bytes]: # deprecated - """Deprecated. Use decode_stream_data.""" - deprecate_with_replacement("decodeStreamData", "decode_stream_data", "4.0.0") - return decode_stream_data(stream) - - def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]: """ Users need to have the pillow package installed. diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py index 48045e0a6..63ccf1bdc 100644 --- a/pypdf/generic/__init__.py +++ b/pypdf/generic/__init__.py @@ -29,9 +29,11 @@ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union -from .._utils import StreamType, deprecate_with_replacement +from .._utils import ( + deprecation_with_replacement, +) from ..constants import OutlineFontFlag from ._base import ( BooleanObject, @@ -69,41 +71,10 @@ ) from ._viewerpref import ViewerPreferences - -def readHexStringFromStream( - stream: StreamType, -) -> Union["TextStringObject", "ByteStringObject"]: # deprecated - """Deprecated, use read_hex_string_from_stream.""" - deprecate_with_replacement( - "readHexStringFromStream", "read_hex_string_from_stream", "4.0.0" - ) - return read_hex_string_from_stream(stream) - - -def readStringFromStream( - stream: StreamType, - forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, -) -> 
Union["TextStringObject", "ByteStringObject"]: # deprecated - """Deprecated, use read_string_from_stream.""" - deprecate_with_replacement( - "readStringFromStream", "read_string_from_stream", "4.0.0" - ) - return read_string_from_stream(stream, forced_encoding) - - -def createStringObject( - string: Union[str, bytes], - forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, -) -> Union[TextStringObject, ByteStringObject]: # deprecated - """Deprecated, use create_string_object.""" - deprecate_with_replacement("createStringObject", "create_string_object", "4.0.0") - return create_string_object(string, forced_encoding) - - PAGE_FIT = Fit.fit() -class AnnotationBuilder: +class AnnotationBuilder: # deprecated """ The AnnotationBuilder is deprecated. @@ -121,26 +92,10 @@ def text( text: str, open: bool = False, flags: int = 0, - ) -> DictionaryObject: - """ - Add text annotation. - - Args: - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the clickable rectangular area - text: The text that is added to the document - open: - flags: - - Returns: - A dictionary object representing the annotation. - """ - deprecate_with_replacement( + ) -> None: + deprecation_with_replacement( "AnnotationBuilder.text", "pypdf.annotations.Text", "4.0.0" ) - from ..annotations import Text - - return Text(rect=rect, text=text, open=open, flags=flags) @staticmethod def free_text( @@ -153,43 +108,10 @@ def free_text( font_color: str = "000000", border_color: Optional[str] = "000000", background_color: Optional[str] = "ffffff", - ) -> DictionaryObject: - """ - Add text in a rectangle to a page. - - Args: - text: Text to be added - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the clickable rectangular area - font: Name of the Font, e.g. 'Helvetica' - bold: Print the text in bold - italic: Print the text in italic - font_size: How big the text will be, e.g. '14pt' - font_color: Hex-string for the color, e.g. 
cdcdcd - border_color: Hex-string for the border color, e.g. cdcdcd. - Use ``None`` for no border. - background_color: Hex-string for the background of the annotation, - e.g. cdcdcd. Use ``None`` for transparent background. - - Returns: - A dictionary object representing the annotation. - """ - deprecate_with_replacement( + ) -> None: + deprecation_with_replacement( "AnnotationBuilder.free_text", "pypdf.annotations.FreeText", "4.0.0" ) - from ..annotations import FreeText - - return FreeText( - text=text, - rect=rect, - font=font, - bold=bold, - italic=italic, - font_size=font_size, - font_color=font_color, - background_color=background_color, - border_color=border_color, - ) @staticmethod def popup( @@ -198,34 +120,10 @@ def popup( flags: int = 0, parent: Optional[DictionaryObject] = None, open: bool = False, - ) -> DictionaryObject: - """ - Add a popup to the document. - - Args: - rect: - Specifies the clickable rectangular area as `[xLL, yLL, xUR, yUR]` - flags: - 1 - invisible, 2 - hidden, 3 - print, 4 - no zoom, - 5 - no rotate, 6 - no view, 7 - read only, 8 - locked, - 9 - toggle no view, 10 - locked contents - open: - Whether the popup should be shown directly (default is False). - parent: - The contents of the popup. Create this via the AnnotationBuilder. - - Returns: - A dictionary object representing the annotation. - """ - deprecate_with_replacement( + ) -> None: + deprecation_with_replacement( "AnnotationBuilder.popup", "pypdf.annotations.Popup", "4.0.0" ) - from ..annotations import Popup - - popup = Popup(rect=rect, open=open, parent=parent) - popup.flags = flags # type: ignore - - return popup @staticmethod def line( @@ -234,74 +132,27 @@ def line( rect: Union[RectangleObject, Tuple[float, float, float, float]], text: str = "", title_bar: Optional[str] = None, - ) -> DictionaryObject: - """ - Draw a line on the PDF. 
- - Args: - p1: First point - p2: Second point - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the clickable rectangular area - text: Text to be displayed as the line annotation - title_bar: Text to be displayed in the title bar of the - annotation; by convention this is the name of the author - - Returns: - A dictionary object representing the annotation. - """ - deprecate_with_replacement( + ) -> None: + deprecation_with_replacement( "AnnotationBuilder.line", "pypdf.annotations.Line", "4.0.0" ) - from ..annotations import Line - - return Line(p1=p1, p2=p2, rect=rect, text=text, title_bar=title_bar) @staticmethod def polyline( vertices: List[Tuple[float, float]], - ) -> DictionaryObject: - """ - Draw a polyline on the PDF. - - Args: - vertices: Array specifying the vertices (x, y) coordinates of the poly-line. - - Returns: - A dictionary object representing the annotation. - """ - deprecate_with_replacement( + ) -> None: + deprecation_with_replacement( "AnnotationBuilder.polyline", "pypdf.annotations.PolyLine", "4.0.0" ) - from ..annotations import PolyLine - - return PolyLine(vertices=vertices) @staticmethod def rectangle( rect: Union[RectangleObject, Tuple[float, float, float, float]], interiour_color: Optional[str] = None, - ) -> DictionaryObject: - """ - Draw a rectangle on the PDF. - - This method uses the /Square annotation type of the PDF format. - - Args: - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the clickable rectangular area - interiour_color: None or hex-string for the color, e.g. cdcdcd - If None is used, the interiour is transparent. - - Returns: - A dictionary object representing the annotation. 
- """ - deprecate_with_replacement( + ) -> None: + deprecation_with_replacement( "AnnotationBuilder.rectangle", "pypdf.annotations.Rectangle", "4.0.0" ) - from ..annotations import Rectangle - - return Rectangle(rect=rect, interiour_color=interiour_color) @staticmethod def highlight( @@ -310,65 +161,25 @@ def highlight( quad_points: ArrayObject, highlight_color: str = "ff0000", printing: bool = False, - ) -> DictionaryObject: - """ - Add a highlight annotation to the document. - - Args: - rect: Array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the highlighted area - quad_points: An ArrayObject of 8 FloatObjects. Must match a word or - a group of words, otherwise no highlight will be shown. - highlight_color: The color used for the highlight. - printing: Whether to print out the highlight annotation when the page - is printed. - - Returns: - A dictionary object representing the annotation. - """ - deprecate_with_replacement( + ) -> None: + deprecation_with_replacement( "AnnotationBuilder.highlight", "pypdf.annotations.Highlight", "4.0.0" ) - from ..annotations import Highlight - - return Highlight( - rect=rect, quad_points=quad_points, highlight_color=highlight_color, printing=printing - ) @staticmethod def ellipse( rect: Union[RectangleObject, Tuple[float, float, float, float]], interiour_color: Optional[str] = None, - ) -> DictionaryObject: - """ - Draw an ellipse on the PDF. - - This method uses the /Circle annotation type of the PDF format. - - Args: - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` specifying - the bounding box of the ellipse - interiour_color: None or hex-string for the color, e.g. cdcdcd - If None is used, the interiour is transparent. - - Returns: - A dictionary object representing the annotation. 
- """ - deprecate_with_replacement( + ) -> None: + deprecation_with_replacement( "AnnotationBuilder.ellipse", "pypdf.annotations.Ellipse", "4.0.0" ) - from ..annotations import Ellipse - - return Ellipse(rect=rect, interiour_color=interiour_color) @staticmethod - def polygon(vertices: List[Tuple[float, float]]) -> DictionaryObject: - deprecate_with_replacement( + def polygon(vertices: List[Tuple[float, float]]) -> None: + deprecation_with_replacement( "AnnotationBuilder.polygon", "pypdf.annotations.Polygon", "4.0.0" ) - from ..annotations import Polygon - - return Polygon(vertices=vertices) from ._fit import DEFAULT_FIT @@ -379,45 +190,10 @@ def link( url: Optional[str] = None, target_page_index: Optional[int] = None, fit: Fit = DEFAULT_FIT, - ) -> DictionaryObject: - """ - Add a link to the document. - - The link can either be an external link or an internal link. - - An external link requires the URL parameter. - An internal link requires the target_page_index, fit, and fit args. - - Args: - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the clickable rectangular area - border: if provided, an array describing border-drawing - properties. See the PDF spec for details. No border will be - drawn if this argument is omitted. - - horizontal corner radius, - - vertical corner radius, and - - border width - - Optionally: Dash - url: Link to a website (if you want to make an external link) - target_page_index: index of the page to which the link should go - (if you want to make an internal link) - fit: Page fit or 'zoom' option. - - Returns: - A dictionary object representing the annotation. 
- """ - deprecate_with_replacement( + ) -> None: + deprecation_with_replacement( "AnnotationBuilder.link", "pypdf.annotations.Link", "4.0.0" ) - from ..annotations import Link - - return Link( - rect=rect, - border=border, - url=url, - target_page_index=target_page_index, - fit=fit, - ) __all__ = [ diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 08bc2806d..215f2c75e 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -52,8 +52,8 @@ from .._utils import ( WHITESPACES, StreamType, - deprecate_no_replacement, - deprecate_with_replacement, + deprecation_no_replacement, + deprecation_with_replacement, logger_warning, read_non_whitespace, read_until_regex, @@ -207,7 +207,7 @@ def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: if encryption_key is not None: # deprecated - deprecate_no_replacement( + deprecation_no_replacement( "the encryption_key parameter of write_to_stream", "5.0.0" ) stream.write(b"[") @@ -464,7 +464,7 @@ def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: if encryption_key is not None: # deprecated - deprecate_no_replacement( + deprecation_no_replacement( "the encryption_key parameter of write_to_stream", "5.0.0" ) stream.write(b"<<\n") @@ -634,10 +634,6 @@ def __init__(self, dct: Optional[DictionaryObject] = None) -> None: if dct: self.update(dct) - def hasChildren(self) -> bool: # deprecated - deprecate_with_replacement("hasChildren", "has_children", "4.0.0") - return self.has_children() - def has_children(self) -> bool: return "/First" in self @@ -827,10 +823,6 @@ def remove_from_tree(self) -> None: else: cast("TreeObject", self["/Parent"]).remove_child(self) - def emptyTree(self) -> None: # deprecated - deprecate_with_replacement("emptyTree", "empty_tree", "4.0.0") - self.empty_tree() - def empty_tree(self) -> None: for child in self: child_obj = child.get_object() 
@@ -921,7 +913,7 @@ def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: if encryption_key is not None: # deprecated - deprecate_no_replacement( + deprecation_no_replacement( "the encryption_key parameter of write_to_stream", "5.0.0" ) self[NameObject(SA.LENGTH)] = NumberObject(len(self._data)) @@ -932,13 +924,10 @@ def write_to_stream( stream.write(b"\nendstream") @staticmethod - def initializeFromDictionary( - data: Dict[str, Any] - ) -> Union["EncodedStreamObject", "DecodedStreamObject"]: - deprecate_with_replacement( + def initializeFromDictionary(data: Dict[str, Any]) -> None: + deprecation_with_replacement( "initializeFromDictionary", "initialize_from_dictionary", "5.0.0" ) # pragma: no cover - return StreamObject.initialize_from_dictionary(data) # pragma: no cover @staticmethod def initialize_from_dictionary( @@ -1041,7 +1030,7 @@ def get_data(self) -> bytes: return decoded.get_data() # This overrides the parent method: - def set_data(self, data: bytes) -> None: # deprecated + def set_data(self, data: bytes) -> None: from ..filters import FlateDecode if self.get(SA.FILTER, "") in (FT.FLATE_DECODE, [FT.FLATE_DECODE]): @@ -1561,7 +1550,7 @@ def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: if encryption_key is not None: # deprecated - deprecate_no_replacement( + deprecation_no_replacement( "the encryption_key parameter of write_to_stream", "5.0.0" ) stream.write(b"<<\n") diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf index 8a04001ddae371fa756d1dc2f607fd42965f0f8f..99da2bb864ca04af9bf493db55843d1b45dc5330 100644 GIT binary patch delta 2685 zcmZ`*U2GIp6wbX!I|LijB5?m_MhmeRYQRV%Xi=l`yU3EVO$sEmQ8zLE zL@8zKeYkC7Vy)1J#)!I-LERdRkA@ec5@R5mh>0fpLL)pFgP+C=)|M7oe^mCPmX4wriIjfwciX>v_>z7fB4u;ZZT@PAb%+1W~!Za|CH{ey~ z++MsHGZNCj@c;_%T!F^w;4ymlZn$M7`YFJSn)e>Vdm^;ccRX0tteQnM{Hq83=IA$1 
zr_DRlxGDVi8Pt_VOdl!jZep^+hPzBvo`9SFz0^_%Xb_X*BqE2IOz3?1IX)VGmPLg} zF;f+Wh?$XPXnlC)Jo<3~W?sT?R|&22bU40T7am!xFCyRkd>!2Zv7J-c*Dp@C>1_iI z;Zw8dR|j!Zn8t14!JB9zt;fzZ>U?SY%GiK82|2H($<0@XP)i-~jYULUzl2=OS4iZbP{IbvAE?i#lTt&VA}*o}{4^FJZFc^q^^(t)&`*(}O6SzJZ|?MQrs%cCjTG0voNU6%e|spPGU|c3n6AFuF*geCqOOdm6MiyRk@B9ntAsv0wAqY4ha2A> zuQP=ye3u>`U5KUBSBKUvm1TLJ=zB7k-QRa0<4R9Aog$4gvAT+z)O${SfThxlhr7*< z7jWDkD~qoWJIGu;3y=5DqLiLE{RWnSZY#8#fhhpN;Cg*&ytO!7xV#W666vufW}v7x zqqkx3!zgJ~Ngq`hK*Bv^-(eNozYHp6^^x&rthhck-VV2@@0S)Yjh}-$V8eVJsXv^& zO`Sg{p-v(}RUKI*?`M%Y;X5nzM`vEa%G2U(yBX_n=J#`7ar7*21WU}wG+tAjy>P;* zKhd8#m?EwUOm3U=be_R68C`D9O~pS55gd%MFsWdSEKITg!F)THDi2n#&80r4(Eo7d zD}_1nBplY!HBW{UpV&n>2^2I@Z5$=h;aH0?;z1W$7!7K2^YEHWpV%sJMWXLWO|tYQ zvv;Bh^=#6fq{1fcNlxtbJjG?+X6OZ8U}HWZ)ZSeXNNSOxgi@KvI#CblX!Pbl}5vUW@blYu2GV9+%+ z$%G2<$tEZyi%zJjBA^~6fgPtPlOk^iGH!P(2t#in2x3_~A=Hm_wbt`Fv#q0nFTA;4 z$d|_sz+1+N27&S|-x+}gYb7KRCahinj7Mj1wF-a|iE3F2C(|+Uog`rI%5RuP-uQHIP_NoFI!DJWPF!S0^Md+%q0<7yN zcf#p@HjhdU|8}waJ7j{OFtlU*(XoAd#`lf}Qo@>U#@nSz@~mB37#-U?{>1(QZh5Jj zy+fsdl_j7VIf-1E>ao#WGAR}@NR_-!0e}_GseT-n+H?p$%?d2#o;S{ zyZl9g`Bln5BfsMDcER=$IBm5jgV{?YAPTlS(nmBr?UK>7Bgf7@^vdvK z@Kz5k!aswwb7t524OE4&ThPI?4_v=K2REJhf}TA4!LL6=@W&v%58f}6b$-cgnS}5+ z$Bka=7`3KtRPz+v{sFSN2*U@d2iK!z1C9!Gs`O9_PQ#y$@vV%950UTYP#pOgxOR<9#Gf4@gCWeHtmE{P9%uM@4X5`) zDgO98`Gp}76D3xOzqvq86=CZCP^;2#%Ckcl>n^HH(RvzWR8N$*5~G z>KZtDm@Zvt>Me*~dX-UI)4cp5{e2O|>*U8>y*8t`%_wdSOdqDf-OY&W;7wp_ zHn24t*qROAn53_-f-4o)3A<6M#0WMqf;Z}eBwliX9&dvaypA1NuErx@(wnP@1+@CT zDbKhk;rT=7qjAjVxEl5I$^DAi8&eJEseiw<6y-@jeg8*##od;MDa-DEtXv;I7gDRG0DA^ zaxoER_EvFPnb74~P{u`>g?^poEfp&(q889r*_M1(Yjk-Qj16_QHTXZ(MZ8546^RJ% z+HAr|^Ok&(<@rLiFj0+}>Mh+hGLuKGrBE%KrSjj~%)vTDq4F^&{XLys=d^E&rWo=c z^2lERO+k#wR&%+~w8-SbP-PQx?p)|nb2YpbNhw=n+K6iQU?D`5*GULv@)`>nx-eIr zkdFJ$UvnWsMm!H%C@ikdV71nE7<756g+j;W4^c+u-7gGwM6Shz5xmx7w^$IyaFy$( zDh8#^A7U)mb%b?*CbTZ6u?UsPdjVtC qlcQQTOD24ymWrCaASO?yP#qDfhfl-BWdE>TCh5rD_?{^6R diff --git a/tests/test_annotations.py b/tests/test_annotations.py index 2064ed402..f6d14c5e4 100644 
--- a/tests/test_annotations.py +++ b/tests/test_annotations.py @@ -2,8 +2,23 @@ from pathlib import Path +import pytest + from pypdf import PdfReader, PdfWriter -from pypdf.annotations import Ellipse, FreeText, Rectangle, Text +from pypdf.annotations import ( + AnnotationDictionary, + Ellipse, + FreeText, + Highlight, + Line, + Link, + Polygon, + PolyLine, + Popup, + Rectangle, + Text, +) +from pypdf.generic import ArrayObject, FloatObject, NumberObject TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent @@ -19,6 +34,12 @@ def test_ellipse_annotation(pdf_file_path): writer.add_page(page) # Act + with pytest.warns(DeprecationWarning): + ellipse_annotation = Ellipse( + rect=(50, 550, 500, 650), + interiour_color="ff0000", + ) + ellipse_annotation = Ellipse( rect=(50, 550, 500, 650), interior_color="ff0000", @@ -109,3 +130,250 @@ def test_free_text_annotation(pdf_file_path): # Assert: You need to inspect the file manually with open(pdf_file_path, "wb") as fp: writer.write(fp) + + +def test_annotationdictionary(): + a = AnnotationDictionary() + a.flags = 123 + assert a.flags == 123 + + +def test_polygon(pdf_file_path): + # Arrange + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + with pytest.raises(ValueError): + Polygon( + vertices=[], + ) + + annotation = Polygon( + vertices=[(50, 550), (200, 650), (70, 750), (50, 700)], + ) + writer.add_annotation(0, annotation) + + # Assert: You need to inspect the file manually + with open(pdf_file_path, "wb") as fp: + writer.write(fp) + + +def test_polyline(pdf_file_path): + # Arrange + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + with pytest.raises(ValueError): + PolyLine( + vertices=[], + ) + + annotation = PolyLine( + vertices=[(50, 550), (200, 650), (70, 750), (50, 700)], + ) + 
writer.add_annotation(0, annotation) + + # Assert: You need to inspect the file manually + with open(pdf_file_path, "wb") as fp: + writer.write(fp) + + +def test_line(pdf_file_path): + # Arrange + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + # Act + line_annotation = Line( + text="Hello World\nLine2", + rect=(50, 550, 200, 650), + p1=(50, 550), + p2=(200, 650), + ) + writer.add_annotation(0, line_annotation) + + # Assert: You need to inspect the file manually + with open(pdf_file_path, "wb") as fp: + writer.write(fp) + + +def test_square(pdf_file_path): + # Arrange + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + # Act + with pytest.warns(DeprecationWarning): + square_annotation = Rectangle( + rect=(50, 550, 200, 650), interiour_color="ff0000" + ) + + square_annotation = Rectangle(rect=(50, 550, 200, 650), interior_color="ff0000") + writer.add_annotation(0, square_annotation) + + square_annotation = Rectangle( + rect=(40, 400, 150, 450), + ) + writer.add_annotation(0, square_annotation) + + # Assert: You need to inspect the file manually + with open(pdf_file_path, "wb") as fp: + writer.write(fp) + + +def test_highlight(pdf_file_path): + # Arrange + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + # Act + highlight_annotation = Highlight( + rect=(95.79332, 704.31777, 138.55779, 724.6855), + highlight_color="ff0000", + quad_points=ArrayObject( + [ + FloatObject(100.060779), + FloatObject(723.55398), + FloatObject(134.29033), + FloatObject(723.55398), + FloatObject(100.060779), + FloatObject(705.4493), + FloatObject(134.29033), + FloatObject(705.4493), + ] + ), + printing=False, + ) + writer.add_annotation(0, highlight_annotation) + for annot in 
writer.pages[0]["/Annots"]: + obj = annot.get_object() + subtype = obj["/Subtype"] + if subtype == "/Highlight": + assert "/F" not in obj or obj["/F"] == NumberObject(0) + + writer.add_page(page) + # Act + highlight_annotation = Highlight( + rect=(95.79332, 704.31777, 138.55779, 724.6855), + highlight_color="ff0000", + quad_points=ArrayObject( + [ + FloatObject(100.060779), + FloatObject(723.55398), + FloatObject(134.29033), + FloatObject(723.55398), + FloatObject(100.060779), + FloatObject(705.4493), + FloatObject(134.29033), + FloatObject(705.4493), + ] + ), + printing=True, + ) + writer.add_annotation(1, highlight_annotation) + for annot in writer.pages[1]["/Annots"]: + obj = annot.get_object() + subtype = obj["/Subtype"] + if subtype == "/Highlight": + assert obj["/F"] == NumberObject(4) + + # Assert: You need to inspect the file manually + with open(pdf_file_path, "wb") as fp: + writer.write(fp) + + +def test_link(pdf_file_path): + # Arrange + pdf_path = RESOURCE_ROOT / "outline-without-title.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + # Act + # Part 1: Too many args + with pytest.raises(ValueError): + Link( + rect=(50, 550, 200, 650), + url="https://martin-thoma.com/", + target_page_index=3, + ) + + # Part 2: Too few args + with pytest.raises(ValueError): + Link( + rect=(50, 550, 200, 650), + ) + + # Part 3: External Link + link_annotation = Link( + rect=(50, 50, 100, 100), + url="https://martin-thoma.com/", + border=[1, 0, 6, [3, 2]], + ) + writer.add_annotation(0, link_annotation) + + # Part 4: Internal Link + link_annotation = Link( + rect=(100, 100, 300, 200), + target_page_index=1, + border=[50, 10, 4], + ) + writer.add_annotation(0, link_annotation) + + for page in reader.pages[1:]: + writer.add_page(page) + + # Assert: You need to inspect the file manually + with open(pdf_file_path, "wb") as fp: + writer.write(fp) + + +def test_popup(caplog): + # Arrange + pdf_path = RESOURCE_ROOT / 
"outline-without-title.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + # Act + text_annotation = Text( + title_bar="hello world", + text="Hello World\nThis is the second line!", + rect=(50, 550, 200, 650), + open=True, + ) + ta = writer.add_annotation(0, text_annotation) + popup_annotation = Popup( + rect=(50, 550, 200, 650), + open=True, + parent=ta, # prefer to use for evolutivity + ) + writer.add_annotation(writer.pages[0], popup_annotation) + + Popup( + rect=(50, 550, 200, 650), + open=True, + parent=True, # broken parameter # type: ignore + ) + assert "Unregistered Parent object : No Parent field set" in caplog.text + + target = "annotated-pdf-popup.pdf" + writer.write(target) + Path(target).unlink() # comment this out for manual inspection diff --git a/tests/test_encryption.py b/tests/test_encryption.py index 39ee17453..f5c494cb9 100644 --- a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -171,7 +171,7 @@ def test_read_page_from_encrypted_file_aes_256(pdffile, password): @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_encrypted_pdfs(names): """Encrypted PDFs can be merged after decryption.""" - merger = pypdf.PdfMerger() + merger = pypdf.PdfWriter() files = [RESOURCE_ROOT / "encryption" / x for x in names] pdfs = [pypdf.PdfReader(x) for x in files] for pdf in pdfs: diff --git a/tests/test_generic.py b/tests/test_generic.py index 190bb25e6..a13aa7b09 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -5,15 +5,13 @@ from copy import deepcopy from io import BytesIO from pathlib import Path -from unittest.mock import patch import pytest -from pypdf import PdfMerger, PdfReader, PdfWriter +from pypdf import PdfReader, PdfWriter from pypdf.constants import CheckboxRadioButtonAttributes from pypdf.errors import PdfReadError, PdfStreamError from pypdf.generic import ( - AnnotationBuilder, ArrayObject, BooleanObject, ByteStringObject, @@ -701,387 +699,27 
@@ def test_bool_repr(tmp_path): @pytest.mark.enable_socket() -@patch("pypdf._reader.logger_warning") -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_issue_997(mock_logger_warning, pdf_file_path): +def test_issue_997(pdf_file_path): url = ( "https://github.com/py-pdf/pypdf/files/8908874/" "Exhibit_A-2_930_Enterprise_Zone_Tax_Credits_final.pdf" ) name = "gh-issue-997.pdf" - merger = PdfMerger() + merger = PdfWriter() merger.append(BytesIO(get_data_from_url(url, name=name))) # here the error raises with open(pdf_file_path, "wb") as f: merger.write(f) merger.close() - mock_logger_warning.assert_called_with("Overwriting cache for 0 4", "pypdf._reader") - # Strict - merger = PdfMerger(strict=True) - with pytest.raises(PdfReadError) as exc: - merger.append( - BytesIO(get_data_from_url(url, name=name)) - ) # here the error raises - assert exc.value.args[0] == "Could not find object." + merger = PdfWriter() + merger.append(BytesIO(get_data_from_url(url, name=name))) # here the error raises with open(pdf_file_path, "wb") as f: merger.write(f) merger.close() -def test_annotation_builder_free_text(pdf_file_path): - # Arrange - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - # Act - with pytest.warns(DeprecationWarning): - free_text_annotation = AnnotationBuilder.free_text( - "Hello World - bold and italic\nThis is the second line!", - rect=(50, 550, 200, 650), - font="Arial", - bold=True, - italic=True, - font_size="20pt", - font_color="00ff00", - border_color=None, - background_color=None, - ) - writer.add_annotation(0, free_text_annotation) - - with pytest.warns(DeprecationWarning): - free_text_annotation = AnnotationBuilder.free_text( - "Another free text annotation (not bold, not italic)", - rect=(500, 550, 200, 650), - font="Arial", - bold=False, - italic=False, - font_size="20pt", - font_color="00ff00", - border_color="0000ff", - 
background_color="cdcdcd", - ) - writer.add_annotation(0, free_text_annotation) - - # Assert: You need to inspect the file manually - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_annotation_builder_polygon(pdf_file_path): - # Arrange - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - # Act - with pytest.warns(DeprecationWarning), pytest.raises(ValueError) as exc: - AnnotationBuilder.polygon( - vertices=[], - ) - assert exc.value.args[0] == "A polygon needs at least 1 vertex with two coordinates" - - with pytest.warns(DeprecationWarning): - annotation = AnnotationBuilder.polygon( - vertices=[(50, 550), (200, 650), (70, 750), (50, 700)], - ) - writer.add_annotation(0, annotation) - - # Assert: You need to inspect the file manually - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_annotation_builder_polyline(pdf_file_path, pdf_reader_page): - # Arrange - writer = PdfWriter() - writer.add_page(pdf_reader_page) - - # Act - with pytest.warns(DeprecationWarning), pytest.raises(ValueError) as exc: - AnnotationBuilder.polyline( - vertices=[], - ) - assert exc.value.args[0] == "A polygon needs at least 1 vertex with two coordinates" - - with pytest.warns(DeprecationWarning): - annotation = AnnotationBuilder.polyline( - vertices=[(50, 550), (200, 650), (70, 750), (50, 700)], - ) - writer.add_annotation(0, annotation) - - # Assert: You need to inspect the file manually - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_annotation_builder_line(pdf_file_path): - # Arrange - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - # Act - with pytest.warns(DeprecationWarning): - line_annotation = AnnotationBuilder.line( - text="Hello World\nLine2", - rect=(50, 550, 200, 650), - p1=(50, 550), - p2=(200, 650), - ) - 
writer.add_annotation(0, line_annotation) - - # Assert: You need to inspect the file manually - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_annotation_builder_square(pdf_file_path): - # Arrange - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - # Act - with pytest.warns(DeprecationWarning): - square_annotation = AnnotationBuilder.rectangle( - rect=(50, 550, 200, 650), interiour_color="ff0000" - ) - writer.add_annotation(0, square_annotation) - - with pytest.warns(DeprecationWarning): - square_annotation = AnnotationBuilder.rectangle( - rect=(40, 400, 150, 450), - ) - writer.add_annotation(0, square_annotation) - - # Assert: You need to inspect the file manually - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_annotation_builder_highlight(pdf_file_path): - # Arrange - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - # Act - with pytest.warns(DeprecationWarning): - highlight_annotation = AnnotationBuilder.highlight( - rect=(95.79332, 704.31777, 138.55779, 724.6855), - highlight_color="ff0000", - quad_points=ArrayObject( - [ - FloatObject(100.060779), - FloatObject(723.55398), - FloatObject(134.29033), - FloatObject(723.55398), - FloatObject(100.060779), - FloatObject(705.4493), - FloatObject(134.29033), - FloatObject(705.4493), - ] - ), - printing=False, - ) - writer.add_annotation(0, highlight_annotation) - for annot in writer.pages[0]["/Annots"]: - obj = annot.get_object() - subtype = obj["/Subtype"] - if subtype == "/Highlight": - assert "/F" not in obj or obj["/F"] == NumberObject(0) - - writer.add_page(page) - # Act - with pytest.warns(DeprecationWarning): - highlight_annotation = AnnotationBuilder.highlight( - rect=(95.79332, 704.31777, 138.55779, 724.6855), - highlight_color="ff0000", - quad_points=ArrayObject( 
- [ - FloatObject(100.060779), - FloatObject(723.55398), - FloatObject(134.29033), - FloatObject(723.55398), - FloatObject(100.060779), - FloatObject(705.4493), - FloatObject(134.29033), - FloatObject(705.4493), - ] - ), - printing=True, - ) - writer.add_annotation(1, highlight_annotation) - for annot in writer.pages[1]["/Annots"]: - obj = annot.get_object() - subtype = obj["/Subtype"] - if subtype == "/Highlight": - assert obj["/F"] == NumberObject(4) - - # Assert: You need to inspect the file manually - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_annotation_builder_circle(pdf_file_path): - # Arrange - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - # Act - with pytest.warns(DeprecationWarning): - circle_annotation = AnnotationBuilder.ellipse( - rect=(50, 550, 200, 650), interiour_color="ff0000" - ) - writer.add_annotation(0, circle_annotation) - - diameter = 100 - with pytest.warns(DeprecationWarning): - circle_annotation = AnnotationBuilder.ellipse( - rect=(110, 500, 110 + diameter, 500 + diameter), - ) - writer.add_annotation(0, circle_annotation) - - # Assert: You need to inspect the file manually - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_annotation_builder_link(pdf_file_path): - # Arrange - pdf_path = RESOURCE_ROOT / "outline-without-title.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - # Act - # Part 1: Too many args - with pytest.warns(DeprecationWarning), pytest.raises(ValueError) as exc: - AnnotationBuilder.link( - rect=(50, 550, 200, 650), - url="https://martin-thoma.com/", - target_page_index=3, - ) - assert exc.value.args[0] == ( - "Either 'url' or 'target_page_index' have to be provided. 
" - "url='https://martin-thoma.com/', target_page_index=3" - ) - - # Part 2: Too few args - with pytest.warns(DeprecationWarning), pytest.raises(ValueError) as exc: - AnnotationBuilder.link( - rect=(50, 550, 200, 650), - ) - assert ( - exc.value.args[0] - == "Either 'url' or 'target_page_index' have to be provided. Both were None." - ) - - # Part 3: External Link - with pytest.warns(DeprecationWarning): - link_annotation = AnnotationBuilder.link( - rect=(50, 50, 100, 100), - url="https://martin-thoma.com/", - border=[1, 0, 6, [3, 2]], - ) - writer.add_annotation(0, link_annotation) - - # Part 4: Internal Link - with pytest.warns(DeprecationWarning): - link_annotation = AnnotationBuilder.link( - rect=(100, 100, 300, 200), - target_page_index=1, - border=[50, 10, 4], - ) - writer.add_annotation(0, link_annotation) - - for page in reader.pages[1:]: - writer.add_page(page) - - # Assert: You need to inspect the file manually - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_annotation_builder_text(pdf_file_path): - # Arrange - pdf_path = RESOURCE_ROOT / "outline-without-title.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - # Act - with pytest.warns(DeprecationWarning): - text_annotation = AnnotationBuilder.text( - text="Hello World\nThis is the second line!", - rect=(50, 550, 500, 650), - open=True, - ) - writer.add_annotation(0, text_annotation) - - # Assert: You need to inspect the file manually - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_annotation_builder_popup(caplog): - # Arrange - pdf_path = RESOURCE_ROOT / "outline-without-title.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - # Act - with pytest.warns(DeprecationWarning): - text_annotation = AnnotationBuilder.text( - text="Hello World\nThis is the second line!", - rect=(50, 550, 200, 650), - open=True, - ) - ta = 
writer.add_annotation(0, text_annotation) - - with pytest.warns(DeprecationWarning): - popup_annotation = AnnotationBuilder.popup( - rect=(50, 550, 200, 650), - open=True, - parent=ta, # prefer to use for evolutivity - ) - - assert caplog.text == "" - with pytest.warns(DeprecationWarning): - AnnotationBuilder.popup( - rect=(50, 550, 200, 650), - open=True, - parent=True, # broken parameter # type: ignore - ) - assert "Unregistered Parent object : No Parent field set" in caplog.text - - writer.add_annotation(writer.pages[0], popup_annotation) - - target = "annotated-pdf-popup.pdf" - writer.write(target) - Path(target).unlink() # comment this out for manual inspection - - def test_checkboxradiobuttonattributes_opt(): assert "/Opt" in CheckboxRadioButtonAttributes.attributes_dict() @@ -1325,6 +963,13 @@ def test_encodedstream_set_data(): assert str(cc["/DecodeParms"]) == "[NullObject, NullObject, NullObject]" assert cc[NameObject("/Test")] == "/MyTest" + with pytest.raises(TypeError): + aa.set_data("toto") + + aa[NameObject("/Filter")] = NameObject("/JPXEncode") + with pytest.raises(PdfReadError): + aa.set_data(b"toto") + @pytest.mark.enable_socket() def test_set_data_2(): diff --git a/tests/test_merger.py b/tests/test_merger.py index 3d7917902..c9112eae3 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -7,6 +7,7 @@ import pypdf from pypdf import PdfMerger, PdfReader, PdfWriter +from pypdf.errors import DeprecationError from pypdf.generic import Destination, Fit from . import get_data_from_url @@ -36,7 +37,7 @@ def merger_operate(merger): data = fp.read() merger.append(data) assert exc.value.args[0].startswith( - "PdfMerger.merge requires an object that PdfReader can parse. " + "Merging requires an object that PdfReader can parse. 
" "Typically, that is a Path" ) @@ -156,21 +157,6 @@ def check_outline(tmp_path): tmp_filename = "dont_commit_merged.pdf" -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_merger_operations_by_traditional_usage(tmp_path): - # Arrange - merger = PdfMerger() - merger_operate(merger) - path = tmp_path / tmp_filename - - # Act - merger.write(path) - merger.close() - - # Assert - check_outline(path) - - def test_merger_operations_by_traditional_usage_with_writer(tmp_path): # Arrange merger = PdfWriter() @@ -184,19 +170,6 @@ def test_merger_operations_by_traditional_usage_with_writer(tmp_path): check_outline(path) -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_merger_operations_by_semi_traditional_usage(tmp_path): - path = tmp_path / tmp_filename - - with PdfMerger() as merger: - merger_operate(merger) - merger.write(path) # Act - - # Assert - assert Path(path).is_file() - check_outline(path) - - def test_merger_operations_by_semi_traditional_usage_with_writer(tmp_path): path = tmp_path / tmp_filename @@ -209,16 +182,6 @@ def test_merger_operations_by_semi_traditional_usage_with_writer(tmp_path): check_outline(path) -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_merger_operation_by_new_usage(tmp_path): - path = tmp_path / tmp_filename - with PdfMerger(fileobj=path) as merger: - merger_operate(merger) - # Assert - assert Path(path).is_file() - check_outline(path) - - def test_merger_operation_by_new_usage_with_writer(tmp_path): path = tmp_path / tmp_filename with PdfWriter(fileobj=path) as merger: @@ -229,16 +192,6 @@ def test_merger_operation_by_new_usage_with_writer(tmp_path): check_outline(path) -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_merge_page_exception(): - merger = pypdf.PdfMerger() - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - with pytest.raises(TypeError) as exc: - merger.merge(0, pdf_path, pages="a:b") - assert exc.value.args[0] == '"pages" must be a tuple of (start, 
stop[, step])' - merger.close() - - def test_merge_page_exception_with_writer(): merger = pypdf.PdfWriter() pdf_path = RESOURCE_ROOT / "crazyones.pdf" @@ -251,14 +204,6 @@ def test_merge_page_exception_with_writer(): merger.close() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_merge_page_tuple(): - merger = pypdf.PdfMerger() - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - merger.merge(0, pdf_path, pages=(0, 1)) - merger.close() - - def test_merge_page_tuple_with_writer(): merger = pypdf.PdfWriter() pdf_path = RESOURCE_ROOT / "crazyones.pdf" @@ -266,44 +211,6 @@ def test_merge_page_tuple_with_writer(): merger.close() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_merge_write_closed_fh(): - merger = pypdf.PdfMerger() - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - merger.append(pdf_path) - - err_closed = "close() was called and thus the writer cannot be used anymore" - - merger.close() - with pytest.raises(RuntimeError) as exc: - merger.write("test_merge_write_closed_fh.pdf") - assert exc.value.args[0] == err_closed - - with pytest.raises(RuntimeError) as exc: - merger.add_metadata({"author": "Martin Thoma"}) - assert exc.value.args[0] == err_closed - - with pytest.raises(RuntimeError) as exc: - merger.set_page_layout("/SinglePage") - assert exc.value.args[0] == err_closed - - with pytest.raises(RuntimeError) as exc: - merger.page_mode = "/UseNone" - assert exc.value.args[0] == err_closed - - with pytest.raises(RuntimeError) as exc: - merger._write_outline() - assert exc.value.args[0] == err_closed - - with pytest.raises(RuntimeError) as exc: - merger.add_outline_item("An outline item", 0) - assert exc.value.args[0] == err_closed - - with pytest.raises(RuntimeError) as exc: - merger._write_dests() - assert exc.value.args[0] == err_closed - - def test_merge_write_closed_fh_with_writer(pdf_file_path): merger = pypdf.PdfWriter() pdf_path = RESOURCE_ROOT / "crazyones.pdf" @@ -317,18 +224,6 @@ def 
test_merge_write_closed_fh_with_writer(pdf_file_path): merger.add_outline_item("An outline item", 0) -@pytest.mark.enable_socket() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_trim_outline_list(pdf_file_path): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/995/995175.pdf" - name = "tika-995175.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - merger.close() - - @pytest.mark.enable_socket() def test_trim_outline_list_with_writer(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/995/995175.pdf" @@ -336,18 +231,7 @@ def test_trim_outline_list_with_writer(pdf_file_path): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfWriter() merger.append(reader) - merger.write(pdf_file_path) - merger.close() - - -@pytest.mark.enable_socket() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_zoom(pdf_file_path): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/994/994759.pdf" - name = "tika-994759.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) + merger.add_outline_item_dict(merger.outline[0]) merger.write(pdf_file_path) merger.close() @@ -365,12 +249,13 @@ def test_zoom_with_writer(pdf_file_path): @pytest.mark.enable_socket() @pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_zoom_xyz_no_left(pdf_file_path): +def test_zoom_xyz_no_left_with_add_page(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/933/933322.pdf" name = "tika-933322.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) + merger = PdfWriter() + for p in reader.pages: + merger.add_page(p) merger.write(pdf_file_path) merger.close() @@ -386,18 +271,6 @@ def test_zoom_xyz_no_left_with_writer(pdf_file_path): merger.close() 
-@pytest.mark.enable_socket() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_outline_item(pdf_file_path): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/997/997511.pdf" - name = "tika-997511.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - merger.close() - - @pytest.mark.enable_socket() @pytest.mark.slow() def test_outline_item_with_writer(pdf_file_path): @@ -410,19 +283,6 @@ def test_outline_item_with_writer(pdf_file_path): merger.close() -@pytest.mark.enable_socket() -@pytest.mark.slow() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_trim_outline(pdf_file_path): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/982/982336.pdf" - name = "tika-982336.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - merger.close() - - @pytest.mark.enable_socket() @pytest.mark.slow() def test_trim_outline_with_writer(pdf_file_path): @@ -435,19 +295,6 @@ def test_trim_outline_with_writer(pdf_file_path): merger.close() -@pytest.mark.enable_socket() -@pytest.mark.slow() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test1(pdf_file_path): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/923/923621.pdf" - name = "tika-923621.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - merger.close() - - @pytest.mark.enable_socket() @pytest.mark.slow() def test1_with_writer(pdf_file_path): @@ -460,23 +307,6 @@ def test1_with_writer(pdf_file_path): merger.close() -@pytest.mark.enable_socket() -@pytest.mark.slow() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_sweep_recursion1(pdf_file_path): - # TODO: This test looks like an infinite loop. 
- url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" - name = "tika-924546.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - merger.close() - - reader2 = PdfReader(pdf_file_path) - reader2.pages - - @pytest.mark.enable_socket() @pytest.mark.slow() def test_sweep_recursion1_with_writer(pdf_file_path): @@ -493,34 +323,6 @@ def test_sweep_recursion1_with_writer(pdf_file_path): reader2.pages -@pytest.mark.enable_socket() -@pytest.mark.slow() -@pytest.mark.parametrize( - ("url", "name"), - [ - ( - # TODO: This test looks like an infinite loop. - "https://corpora.tika.apache.org/base/docs/govdocs1/924/924794.pdf", - "tika-924794.pdf", - ), - ( - "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf", - "tika-924546.pdf", - ), - ], -) -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_sweep_recursion2(url, name, pdf_file_path): - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - merger.close() - - reader2 = PdfReader(pdf_file_path) - reader2.pages - - @pytest.mark.enable_socket() @pytest.mark.slow() @pytest.mark.parametrize( @@ -548,22 +350,6 @@ def test_sweep_recursion2_with_writer(url, name, pdf_file_path): reader2.pages -@pytest.mark.enable_socket() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_sweep_indirect_list_newobj_is_none(caplog, pdf_file_path): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/906/906769.pdf" - name = "tika-906769.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - merger.close() - # used to be: assert "Object 21 0 not defined." 
in caplog.text - - reader2 = PdfReader(pdf_file_path) - reader2.pages - - @pytest.mark.enable_socket() def test_sweep_indirect_list_newobj_is_none_with_writer(caplog, pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/906/906769.pdf" @@ -579,17 +365,6 @@ def test_sweep_indirect_list_newobj_is_none_with_writer(caplog, pdf_file_path): reader2.pages -@pytest.mark.enable_socket() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_iss1145(): - # issue with FitH destination with null param - url = "https://github.com/py-pdf/pypdf/files/9164743/file-0.pdf" - name = "iss1145.pdf" - merger = PdfMerger() - merger.append(PdfReader(BytesIO(get_data_from_url(url, name=name)))) - merger.close() - - @pytest.mark.enable_socket() def test_iss1145_with_writer(): # issue with FitH destination with null param @@ -600,22 +375,6 @@ def test_iss1145_with_writer(): merger.close() -@pytest.mark.enable_socket() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_iss1344(caplog): - url = "https://github.com/py-pdf/pypdf/files/9549001/input.pdf" - name = "iss1344.pdf" - m = PdfMerger() - m.append(PdfReader(BytesIO(get_data_from_url(url, name=name)))) - b = BytesIO() - m.write(b) - r = PdfReader(b) - p = r.pages[0] - assert "/DIJMAC+Arial Black" in p._debug_for_extract() - assert "adresse où le malade peut être visité" in p.extract_text() - assert r.threads is None - - @pytest.mark.enable_socket() def test_iss1344_with_writer(caplog): url = "https://github.com/py-pdf/pypdf/files/9549001/input.pdf" @@ -643,5 +402,10 @@ def test_articles_with_writer(caplog): def test_deprecate_pdfmerger(): - with pytest.warns(DeprecationWarning), PdfMerger() as merger: + with pytest.raises(DeprecationError), PdfMerger() as merger: merger.append(RESOURCE_ROOT / "crazyones.pdf") + + +def test_get_reference(): + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") + assert writer.get_reference(writer.pages[0]) == writer.pages[0].indirect_reference diff --git 
a/tests/test_workflows.py b/tests/test_workflows.py index f307271e7..77451ef99 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -14,7 +14,7 @@ from PIL import Image, ImageChops from PIL import __version__ as pil_version -from pypdf import PdfMerger, PdfReader, PdfWriter +from pypdf import PdfReader, PdfWriter from pypdf.constants import PageAttributes as PG from pypdf.errors import PdfReadError, PdfReadWarning from pypdf.generic import ( @@ -356,7 +356,7 @@ def test_overlay(pdf_file_path, base_path, overlay_path): def test_merge_with_warning(tmp_path, url, name): data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) - merger = PdfMerger() + merger = PdfWriter() merger.append(reader) # This could actually be a performance bottleneck: merger.write(tmp_path / "tmp.merged.pdf") @@ -376,7 +376,7 @@ def test_merge_with_warning(tmp_path, url, name): def test_merge(tmp_path, url, name): data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) - merger = PdfMerger() + merger = PdfWriter() merger.append(reader) merger.write(tmp_path / "tmp.merged.pdf") @@ -585,7 +585,6 @@ def test_scale_rectangle_indirect_object(): page.scale(sx=2, sy=3) -@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_output(caplog): # Arrange base = RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR.pdf" @@ -593,10 +592,8 @@ def test_merge_output(caplog): expected = RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf" # Act - merger = PdfMerger(strict=True) + merger = PdfWriter() merger.append(base) - msg = "Xref table not zero-indexed. ID numbers for objects will be corrected." 
- assert normalize_warnings(caplog.text) == [msg] merger.merge(1, crazy) stream = BytesIO() merger.write(stream) diff --git a/tests/test_writer.py b/tests/test_writer.py index e06db389b..a31c6f6bb 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -14,7 +14,6 @@ ImageType, ObjectDeletionFlag, PageObject, - PdfMerger, PdfReader, PdfWriter, Transformation, @@ -840,7 +839,7 @@ def test_sweep_indirect_references_nullobject_exception(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() + merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) @@ -864,11 +863,6 @@ def test_sweep_indirect_references_nullobject_exception(pdf_file_path): @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_some_appends(pdf_file_path, url, name): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - # PdfMerger - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - # PdfWriter merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) From 8ebd311a4088da81dbd59c7ecc7de13c4e86f595 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 14 Sep 2024 16:02:58 +0200 Subject: [PATCH 32/42] MAINT: Simplify test with None and NullObject (#2829) --- pypdf/_cmap.py | 7 ++++--- pypdf/_doc_common.py | 5 +++-- pypdf/_page.py | 21 +++++++++++++-------- pypdf/_page_labels.py | 19 ++++++++++++------- pypdf/_reader.py | 10 ++++++---- pypdf/_writer.py | 14 ++++++++------ pypdf/filters.py | 4 +--- pypdf/generic/__init__.py | 2 ++ pypdf/generic/_base.py | 10 ++++++++++ pypdf/generic/_data_structures.py | 11 +++++++---- pypdf/generic/_fit.py | 5 +++-- pypdf/generic/_viewerpref.py | 6 +++--- tests/test_generic.py | 16 ++++++++++++++++ 13 files changed, 88 insertions(+), 42 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 
6c5996703..dcf3678bd 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -7,8 +7,8 @@ from .generic import ( DecodedStreamObject, DictionaryObject, - NullObject, StreamObject, + is_null_or_none, ) @@ -468,7 +468,7 @@ def compute_space_width( cpt += 1 sp_width = m / max(1, cpt) / 2 - if sp_width is None or isinstance(sp_width, NullObject): + if is_null_or_none(sp_width): sp_width = 0.0 return sp_width @@ -482,8 +482,9 @@ def type1_alternative( if "/FontDescriptor" not in ft: return map_dict, space_code, int_entry ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile") - if ft_desc is None: + if is_null_or_none(ft_desc): return map_dict, space_code, int_entry + assert ft_desc is not None, "mypy" txt = ft_desc.get_object().get_data() txt = txt.split(b"eexec\n")[0] # only clear part txt = txt.split(b"/Encoding")[1] # to get the encoding part diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 8d07098b4..55c6aad67 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -85,6 +85,7 @@ TreeObject, ViewerPreferences, create_string_object, + is_null_or_none, ) from .types import OutlineType, PagemodeType from .xmp import XmpInformation @@ -761,7 +762,7 @@ def _get_inherited(obj: DictionaryObject, key: str) -> Any: field = cast(DictionaryObject, field.indirect_reference.get_object()) # type: ignore except Exception as exc: raise ValueError("field type is invalid") from exc - if _get_inherited(field, "/FT") is None: + if is_null_or_none(_get_inherited(field, "/FT")): raise ValueError("field is not valid") ret = [] if field.get("/Subtype", "") == "/Widget": @@ -852,7 +853,7 @@ def _get_outline( return outline # §12.3.3 Document outline, entries in the outline dictionary - if lines is not None and "/First" in lines: + if not is_null_or_none(lines) and "/First" in lines: node = cast(DictionaryObject, lines["/First"]) self._namedDests = self._get_named_destinations() diff --git a/pypdf/_page.py b/pypdf/_page.py index 471256eec..e4ec053c8 100644 
--- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -84,6 +84,7 @@ PdfObject, RectangleObject, StreamObject, + is_null_or_none, ) try: @@ -101,7 +102,7 @@ def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleOb retval: Union[None, RectangleObject, IndirectObject] = self.get(name) if isinstance(retval, RectangleObject): return retval - if retval is None: + if is_null_or_none(retval): for d in defaults: retval = self.get(d) if retval is not None: @@ -492,7 +493,8 @@ def __init__( self.inline_images: Optional[Dict[str, ImageFile]] = None # below Union for mypy but actually Optional[List[str]] self.indirect_reference = indirect_reference - if indirect_reference is not None: + if not is_null_or_none(indirect_reference): + assert indirect_reference is not None, "mypy" self.update(cast(DictionaryObject, indirect_reference.get_object())) def hash_bin(self) -> int: @@ -731,9 +733,10 @@ def _get_inline_images(self) -> Dict[str, ImageFile]: entries will be identified as ~1~ """ content = self.get_contents() - if content is None: + if is_null_or_none(content): return {} imgs_data = [] + assert content is not None, "mypy" for param, ope in content.operations: if ope == b"INLINE IMAGE": imgs_data.append( @@ -1063,7 +1066,7 @@ def replace_contents( for i in range(len(content)): content[i] = self.indirect_reference.pdf._add_object(content[i]) - if content is None: + if is_null_or_none(content): if PG.CONTENTS not in self: return else: @@ -1084,6 +1087,7 @@ def replace_contents( # this will be fixed with the _add_object self[NameObject(PG.CONTENTS)] = content else: + assert content is not None, "mypy" content.indirect_reference = self[ PG.CONTENTS ].indirect_reference # TODO: in a future may required generation management @@ -2218,10 +2222,11 @@ def extract_text( if extraction_mode not in ["plain", "layout"]: raise ValueError(f"Invalid text extraction mode '{extraction_mode}'") if extraction_mode == "layout": - for visitor in ("visitor_operand_before", - 
"visitor_operand_after", - "visitor_text", - ): + for visitor in ( + "visitor_operand_before", + "visitor_operand_after", + "visitor_text", + ): if locals()[visitor]: logger_warning( f"Argument {visitor} is ignored in layout mode", diff --git a/pypdf/_page_labels.py b/pypdf/_page_labels.py index b02527950..1bedc003a 100644 --- a/pypdf/_page_labels.py +++ b/pypdf/_page_labels.py @@ -62,7 +62,13 @@ from ._protocols import PdfCommonDocProtocol from ._utils import logger_warning -from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject +from .generic import ( + ArrayObject, + DictionaryObject, + NullObject, + NumberObject, + is_null_or_none, +) def number2uppercase_roman_numeral(num: int) -> str: @@ -180,11 +186,13 @@ def index2label(reader: PdfCommonDocProtocol, index: int) -> str: # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]} limits = cast(List[int], kid["/Limits"]) if limits[0] <= index <= limits[1]: - if kid.get("/Kids", None) is not None: + if not is_null_or_none(kid.get("/Kids", None)): # Recursive definition. level += 1 if level == 100: # pragma: no cover - raise NotImplementedError("Too deep nesting is not supported.") + raise NotImplementedError( + "Too deep nesting is not supported." + ) number_tree = kid # Exit the inner `for` loop and continue at the next level with the # next iteration of the `while` loop. @@ -195,10 +203,7 @@ def index2label(reader: PdfCommonDocProtocol, index: int) -> str: # and continue with the fallback. 
break - logger_warning( - f"Could not reliably determine page label for {index}.", - __name__ - ) + logger_warning(f"Could not reliably determine page label for {index}.", __name__) return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 58c160302..9948cbea3 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -79,6 +79,7 @@ PdfObject, StreamObject, TextStringObject, + is_null_or_none, read_object, ) from .xmp import XmpInformation @@ -206,11 +207,11 @@ def _info(self) -> Optional[DictionaryObject]: /Info Dictionary; None if the entry does not exist """ info = self.trailer.get(TK.INFO, None) - if info is None: + if is_null_or_none(info): return None else: info = info.get_object() - if info is None: + if info == None: # noqa: E711 raise PdfReadError( "Trailer not found or does not point to document information directory" ) @@ -225,7 +226,7 @@ def _ID(self) -> Optional[ArrayObject]: /ID array; None if the entry does not exist """ id = self.trailer.get(TK.ID, None) - return None if id is None else cast(ArrayObject, id.get_object()) + return None if is_null_or_none(id) else cast(ArrayObject, id.get_object()) def _repr_mimebundle_( self, @@ -298,8 +299,9 @@ def _get_page_number_by_indirect( x.indirect_reference.idnum: i for i, x in enumerate(self.pages) # type: ignore } - if indirect_reference is None or isinstance(indirect_reference, NullObject): + if is_null_or_none(indirect_reference): return None + assert isinstance(indirect_reference, (int, IndirectObject)), "mypy" if isinstance(indirect_reference, int): idnum = indirect_reference else: diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 1e6cb9e26..4d4cca329 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -107,6 +107,7 @@ ViewerPreferences, create_string_object, hex_to_rgb, + is_null_or_none, ) from .pagerange import PageRange, PageRangeSpec from .types import ( @@ -499,7 +500,7 @@ def _add_page( cast(ArrayObject, 
node[PA.KIDS]).append(page.indirect_reference) self.flattened_pages.append(page) cpt = 1000 - while node is not None: + while not is_null_or_none(node): node = cast(DictionaryObject, node.get_object()) node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1) node = node.get(PA.PARENT, None) @@ -612,8 +613,9 @@ def _get_page_number_by_indirect( The page number or None """ # to provide same function as in PdfReader - if indirect_reference is None or isinstance(indirect_reference, NullObject): + if is_null_or_none(indirect_reference): return None + assert indirect_reference is not None, "mypy" if isinstance(indirect_reference, int): indirect_reference = IndirectObject(indirect_reference, 0, self) obj = indirect_reference.get_object() @@ -928,7 +930,7 @@ def _update_field_annotation( ) dr = dr.get_object().get("/Font", DictionaryObject()).get_object() font_res = dr.get(font_name, None) - if font_res is not None: + if not is_null_or_none(font_res): font_res = cast(DictionaryObject, font_res.get_object()) font_subtype, _, font_encoding, font_map = build_char_map_from_dict( 200, font_res @@ -1566,9 +1568,9 @@ def metadata(self) -> Optional[DocumentInformation]: Retrieve/set the PDF file's document information dictionary, if it exists. Args: - value: Dictionary with the entries to set. If None, remove the /Info entry from the PDF. + value: dict with the entries to be set. if None : remove the /Info entry from the pdf. - Note that some PDF files use (XMP) metadata streams instead of document + Note that some PDF files use (xmp)metadata streams instead of document information dictionaries, and these metadata streams will not be accessed by this function. 
""" @@ -2981,7 +2983,7 @@ def _get_filtered_outline( if node is None: node = NullObject() node = node.get_object() - if node is None or isinstance(node, NullObject): + if is_null_or_none(node): node = DictionaryObject() if node.get("/Type", "") == "/Outlines" or "/Title" not in node: node = node.get("/First", None) diff --git a/pypdf/filters.py b/pypdf/filters.py index 7589c8051..e2fdd0d8c 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -746,9 +746,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, ) # for error reporting - if ( - hasattr(x_object_obj, "indirect_reference") and x_object_obj is None - ): # pragma: no cover + if x_object_obj is None: # pragma: no cover obj_as_text = x_object_obj.indirect_reference.__repr__() else: obj_as_text = x_object_obj.__repr__() diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py index 63ccf1bdc..d9b0ea488 100644 --- a/pypdf/generic/__init__.py +++ b/pypdf/generic/__init__.py @@ -46,6 +46,7 @@ PdfObject, TextStringObject, encode_pdfdocencoding, + is_null_or_none, ) from ._data_structures import ( ArrayObject, @@ -235,6 +236,7 @@ def link( "encode_pdfdocencoding", "decode_pdfdocencoding", "hex_to_rgb", + "is_null_or_none", "read_hex_string_from_stream", "read_string_from_stream", ] diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index d02a79810..fd7d1a8ff 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -214,6 +214,16 @@ def __repr__(self) -> str: return "NullObject" +def is_null_or_none(x: Any) -> bool: + """ + Returns: + True if x is None or NullObject. 
+ """ + return x is None or ( + isinstance(x, PdfObject) and isinstance(x.get_object(), NullObject) + ) + + class BooleanObject(PdfObject): def __init__(self, value: Any) -> None: self.value = value diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 215f2c75e..cc4b4a032 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -79,6 +79,7 @@ NumberObject, PdfObject, TextStringObject, + is_null_or_none, ) from ._fit import Fit from ._image_inline import ( @@ -451,7 +452,7 @@ def xmp_metadata(self) -> Optional[XmpInformationProtocol]: from ..xmp import XmpInformation metadata = self.get("/Metadata", None) - if metadata is None: + if is_null_or_none(metadata): return None metadata = metadata.get_object() @@ -651,7 +652,7 @@ def children(self) -> Iterable[Any]: if child == self[NameObject("/Last")]: return child_ref = child.get(NameObject("/Next")) # type: ignore - if child_ref is None: + if is_null_or_none(child_ref): return child = child_ref.get_object() @@ -661,8 +662,9 @@ def add_child(self, child: Any, pdf: PdfWriterProtocol) -> None: def inc_parent_counter_default( self, parent: Union[None, IndirectObject, "TreeObject"], n: int ) -> None: - if parent is None: + if is_null_or_none(parent): return + assert parent is not None, "mypy" parent = cast("TreeObject", parent.get_object()) if "/Count" in parent: parent[NameObject("/Count")] = NumberObject( @@ -673,8 +675,9 @@ def inc_parent_counter_default( def inc_parent_counter_outline( self, parent: Union[None, IndirectObject, "TreeObject"], n: int ) -> None: - if parent is None: + if is_null_or_none(parent): return + assert parent is not None, "mypy" parent = cast("TreeObject", parent.get_object()) # BooleanObject requires comparison with == not is opn = parent.get("/%is_open%", True) == True # noqa diff --git a/pypdf/generic/_fit.py b/pypdf/generic/_fit.py index 4132f4b71..c44d12b4c 100644 --- a/pypdf/generic/_fit.py +++ b/pypdf/generic/_fit.py @@ 
-1,5 +1,7 @@ from typing import Any, Optional, Tuple, Union +from ._base import is_null_or_none + class Fit: def __init__( @@ -9,8 +11,7 @@ def __init__( self.fit_type = NameObject(fit_type) self.fit_args = [ - NullObject() if a is None or isinstance(a, NullObject) else FloatObject(a) - for a in fit_args + NullObject() if is_null_or_none(a) else FloatObject(a) for a in fit_args ] @classmethod diff --git a/pypdf/generic/_viewerpref.py b/pypdf/generic/_viewerpref.py index a12f2d446..72f89d9ae 100644 --- a/pypdf/generic/_viewerpref.py +++ b/pypdf/generic/_viewerpref.py @@ -32,7 +32,7 @@ Optional, ) -from ._base import BooleanObject, NameObject, NumberObject +from ._base import BooleanObject, NameObject, NumberObject, is_null_or_none from ._data_structures import ArrayObject, DictionaryObject f_obj = BooleanObject(False) @@ -156,8 +156,8 @@ def _add_prop_int(key: str, deft: Optional[int]) -> property: def __init__(self, obj: Optional[DictionaryObject] = None) -> None: super().__init__(self) - if obj is not None: - self.update(obj.items()) + if not is_null_or_none(obj): + self.update(obj.items()) # type: ignore try: self.indirect_reference = obj.indirect_reference # type: ignore except AttributeError: diff --git a/tests/test_generic.py b/tests/test_generic.py index a13aa7b09..d5fad26d7 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -33,6 +33,7 @@ TreeObject, create_string_object, encode_pdfdocencoding, + is_null_or_none, read_hex_string_from_stream, read_object, read_string_from_stream, @@ -1139,3 +1140,18 @@ def test_missing_hashbin(): assert NullObject().hash_bin() == hash((NullObject,)) t = ByteStringObject(b"123") assert t.hash_bin() == hash((ByteStringObject, b"123")) + + +def test_is_null_or_none(): + assert is_null_or_none(NullObject()) + assert not is_null_or_none(PdfObject()) + + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + # used with get + assert is_null_or_none(reader.root_object.get("/do_no_exist")) + # object unknown... 
+ assert is_null_or_none(IndirectObject(99999, 0, reader).get_object()) + # ... or which has been replaced with NullObject + writer = PdfWriter(reader) + writer.pages[0]["/Contents"].append(writer._add_object(NullObject())) + assert is_null_or_none(writer.pages[0]["/Contents"][-1]) From ac2983b156e05ac620cb917f3245134be9859aa2 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 14 Sep 2024 23:46:25 +0200 Subject: [PATCH 33/42] STY: Minor code-style improvements for _reader.py --- pypdf/_reader.py | 125 ++++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 55 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 58c160302..4df360c97 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -123,9 +123,19 @@ def __init__( self.xref_objStm: Dict[int, Tuple[Any, Any]] = {} self.trailer = DictionaryObject() - self._page_id2num: Optional[ - Dict[Any, Any] - ] = None # map page indirect_reference number to Page Number + # map page indirect_reference number to page number + self._page_id2num: Optional[Dict[Any, Any]] = None + + self._initialize_stream(stream) + + self._override_encryption = False + self._encryption: Optional[Encryption] = None + if self.is_encrypted: + self._handle_encryption(password) + elif password is not None: + raise PdfReadError("Not an encrypted file") + + def _initialize_stream(self, stream: Union[StrByteType, Path]) -> None: if hasattr(stream, "mode") and "b" not in stream.mode: logger_warning( "PdfReader stream/file object is not in binary mode. " @@ -141,31 +151,25 @@ def __init__( self.read(stream) self.stream = stream + def _handle_encryption(self, password: Optional[Union[str, bytes]]) -> None: + self._override_encryption = True + # Some documents may not have a /ID, use two empty + # byte strings instead. 
Solves + # https://github.com/py-pdf/pypdf/issues/608 + id_entry = self.trailer.get(TK.ID) + id1_entry = id_entry[0].get_object().original_bytes if id_entry else b"" + encrypt_entry = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object()) + self._encryption = Encryption.read(encrypt_entry, id1_entry) + + # try empty password if no password provided + pwd = password if password is not None else b"" + if ( + self._encryption.verify(pwd) == PasswordType.NOT_DECRYPTED + and password is not None + ): + # raise if password provided + raise WrongPasswordError("Wrong password") self._override_encryption = False - self._encryption: Optional[Encryption] = None - if self.is_encrypted: - self._override_encryption = True - # Some documents may not have a /ID, use two empty - # byte strings instead. Solves - # https://github.com/py-pdf/pypdf/issues/608 - id_entry = self.trailer.get(TK.ID) - id1_entry = id_entry[0].get_object().original_bytes if id_entry else b"" - encrypt_entry = cast( - DictionaryObject, self.trailer[TK.ENCRYPT].get_object() - ) - self._encryption = Encryption.read(encrypt_entry, id1_entry) - - # try empty password if no password provided - pwd = password if password is not None else b"" - if ( - self._encryption.verify(pwd) == PasswordType.NOT_DECRYPTED - and password is not None - ): - # raise if password provided - raise WrongPasswordError("Wrong password") - self._override_encryption = False - elif password is not None: - raise PdfReadError("Not encrypted file") def __enter__(self) -> "PdfReader": return self @@ -285,28 +289,29 @@ def _get_page_number_by_indirect( self, indirect_reference: Union[None, int, NullObject, IndirectObject] ) -> Optional[int]: """ - Generate _page_id2num. + Retrieve the page number from an indirect reference. Args: - indirect_reference: + indirect_reference: The indirect reference to locate. Returns: - The page number or None + Page number or None. 
""" if self._page_id2num is None: self._page_id2num = { - x.indirect_reference.idnum: i for i, x in enumerate(self.pages) # type: ignore + x.indirect_reference.idnum: i + for i, x in enumerate(self.pages) + if x.indirect_reference is not None } if indirect_reference is None or isinstance(indirect_reference, NullObject): return None - if isinstance(indirect_reference, int): - idnum = indirect_reference - else: - idnum = indirect_reference.idnum - assert self._page_id2num is not None, "hint for mypy" - ret = self._page_id2num.get(idnum, None) - return ret + idnum = ( + indirect_reference + if isinstance(indirect_reference, int) + else indirect_reference.idnum + ) + return self._page_id2num.get(idnum) def _get_object_from_stream( self, indirect_reference: IndirectObject @@ -560,6 +565,12 @@ def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject return obj def read(self, stream: StreamType) -> None: + """ + Read and process the PDF stream, extracting necessary data. + + Args: + stream (StreamType): The PDF file stream. + """ self._basic_validation(stream) self._find_eof_marker(stream) startxref = self._find_startxref_pos(stream) @@ -619,7 +630,7 @@ def read(self, stream: StreamType) -> None: stream.seek(loc, 0) # return to where it was def _basic_validation(self, stream: StreamType) -> None: - """Ensure file is not empty. 
Read at most 5 bytes.""" + """Ensure the stream is valid and not empty.""" stream.seek(0, os.SEEK_SET) try: header_byte = stream.read(5) @@ -801,6 +812,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: def _read_xref_tables_and_trailers( self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int ) -> None: + """Read the cross-reference tables and trailers in the PDF stream.""" self.xref = {} self.xref_free_entry = {} self.xref_objStm = {} @@ -825,21 +837,12 @@ def _read_xref_tables_and_trailers( except Exception as e: if TK.ROOT in self.trailer: logger_warning( - f"Previous trailer can not be read {e.args}", - __name__, + f"Previous trailer cannot be read: {e.args}", __name__ ) break else: - raise PdfReadError(f"trailer can not be read {e.args}") - trailer_keys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE - for key in trailer_keys: - if key in xrefstream and key not in self.trailer: - self.trailer[NameObject(key)] = xrefstream.raw_get(key) - if "/XRefStm" in xrefstream: - p = stream.tell() - stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0) - self._read_pdf15_xref_stream(stream) - stream.seek(p, 0) + raise PdfReadError(f"Trailer cannot be read: {e.args}") + self._process_xref_stream(xrefstream) if "/Prev" in xrefstream: startxref = cast(int, xrefstream["/Prev"]) else: @@ -847,6 +850,18 @@ def _read_xref_tables_and_trailers( else: startxref = self._read_xref_other_error(stream, startxref) + def _process_xref_stream(self, xrefstream: DictionaryObject) -> None: + """Process and handle the xref stream.""" + trailer_keys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE + for key in trailer_keys: + if key in xrefstream and key not in self.trailer: + self.trailer[NameObject(key)] = xrefstream.raw_get(key) + if "/XRefStm" in xrefstream: + p = self.stream.tell() + self.stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0) + self._read_pdf15_xref_stream(self.stream) + self.stream.seek(p, 0) + def _read_xref(self, stream: StreamType) -> 
Optional[int]: self._read_standard_xref_table(stream) if stream.read(1) == b"": @@ -919,7 +934,7 @@ def _read_xref_other_error( def _read_pdf15_xref_stream( self, stream: StreamType ) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]: - # PDF 1.5+ Cross-Reference Stream + """Read the cross-reference stream for PDF 1.5+.""" stream.seek(-1, 1) idnum, generation = self.read_object_header(stream) xrefstream = cast(ContentStream, read_object(stream, self)) @@ -1047,6 +1062,7 @@ def _read_xref_subsections( get_entry: Callable[[int], Union[int, Tuple[int, ...]]], used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool], ) -> None: + """Read and process the subsections of the xref.""" for start, size in self._pairs(idx_pairs): # The subsections must increase for num in range(start, start + size): @@ -1076,12 +1092,11 @@ def _read_xref_subsections( raise PdfReadError(f"Unknown xref type: {xref_type}") def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]: + """Iterate over pairs in the array.""" i = 0 - while True: + while i + 1 < len(array): yield array[i], array[i + 1] i += 2 - if (i + 1) >= len(array): - break def decrypt(self, password: Union[str, bytes]) -> PasswordType: """ From dfa3d5ca8b672db162419652e79ddf83d7d48289 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 15 Sep 2024 06:07:07 +0200 Subject: [PATCH 34/42] Fix tests --- pypdf/_reader.py | 1 + tests/test_encryption.py | 2 +- tests/test_reader.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 5e841ee3d..e5e1ff3ca 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -307,6 +307,7 @@ def _get_page_number_by_indirect( if is_null_or_none(indirect_reference): return None + assert isinstance(indirect_reference, (int, IndirectObject)), "mypy" idnum = ( indirect_reference if isinstance(indirect_reference, int) diff --git a/tests/test_encryption.py b/tests/test_encryption.py index f5c494cb9..be92e40a9 100644 --- 
a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -205,7 +205,7 @@ def test_attempt_decrypt_unencrypted_pdf(): path = RESOURCE_ROOT / "crazyones.pdf" with pytest.raises(PdfReadError) as exc: PdfReader(path, password="nonexistent") - assert exc.value.args[0] == "Not encrypted file" + assert exc.value.args[0] == "Not an encrypted file" @pytest.mark.skipif(not HAS_AES, reason="No AES implementation") diff --git a/tests/test_reader.py b/tests/test_reader.py index 99555cd22..e495ebbc3 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1293,7 +1293,7 @@ def test_reader(caplog): url = "https://github.com/py-pdf/pypdf/files/9464742/shiv_resume.pdf" name = "shiv_resume.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - assert "Previous trailer can not be read" in caplog.text + assert "Previous trailer cannot be read" in caplog.text caplog.clear() # first call requires some reparations... reader.pages[0].extract_text() From 8eefba8f84e8bd8289ab1cb37cf1954273fd25eb Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Sun, 15 Sep 2024 16:51:49 +1000 Subject: [PATCH 35/42] BUG: test_image_without_pillow cannot find pypdf (#2850) test_image_without_pillow runs a generated script which causes the Python path to exclude the current directory. The generated script tries to import pypdf and either cannot find it or it finds the version in pyenv instead of the version being tested. Add "." to PYTHONPATH so the correct version of pypdf is used. 
Closes #2849 --- tests/test_filters.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_filters.py b/tests/test_filters.py index 632095888..c80553546 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -248,6 +248,7 @@ def test_issue_399(): @pytest.mark.enable_socket() def test_image_without_pillow(tmp_path): + import os name = "tika-914102.pdf" pdf_path = Path(__file__).parent / "pdf_cache" / name pdf_path_str = str(pdf_path.resolve()).replace("\\", "/") @@ -273,9 +274,15 @@ def test_image_without_pillow(tmp_path): ), exc.value.args[0] """ ) + env = os.environ.copy() + try: + env["PYTHONPATH"] = "." + os.pathsep + env["PYTHONPATH"] + except KeyError: + env["PYTHONPATH"] = "." result = subprocess.run( [shutil.which("python"), source_file], # noqa: S603 capture_output=True, + env=env, ) assert result.returncode == 0 assert result.stdout == b"" From 6253b4b4cac1c76f0e00797611cfb9ab96e4b1e1 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 15 Sep 2024 14:17:59 +0200 Subject: [PATCH 36/42] Update pypdf/_reader.py Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> --- pypdf/_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index e5e1ff3ca..5c00b5a6a 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -302,7 +302,7 @@ def _get_page_number_by_indirect( self._page_id2num = { x.indirect_reference.idnum: i for i, x in enumerate(self.pages) - if x.indirect_reference is not None + if is_null_or_none(x.indirect_reference) } if is_null_or_none(indirect_reference): From bc3ae82e8bcba67a88466f420cfb4b576f2cebd1 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 15 Sep 2024 14:27:00 +0200 Subject: [PATCH 37/42] fix doc building warning --- pypdf/_reader.py | 4 ++-- pypdf/generic/_base.py | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 5c00b5a6a..c0564f4c2 100644 --- a/pypdf/_reader.py 
+++ b/pypdf/_reader.py @@ -300,7 +300,7 @@ def _get_page_number_by_indirect( """ if self._page_id2num is None: self._page_id2num = { - x.indirect_reference.idnum: i + x.indirect_reference.idnum: i # type: ignore for i, x in enumerate(self.pages) if is_null_or_none(x.indirect_reference) } @@ -571,7 +571,7 @@ def read(self, stream: StreamType) -> None: Read and process the PDF stream, extracting necessary data. Args: - stream (StreamType): The PDF file stream. + stream: The PDF file stream. """ self._basic_validation(stream) self._find_eof_marker(stream) diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index fd7d1a8ff..c2dd73668 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -28,11 +28,17 @@ import codecs import hashlib import re +import sys from binascii import unhexlify from math import log10 from struct import iter_unpack from typing import Any, Callable, ClassVar, Dict, Optional, Sequence, Union, cast +if sys.version_info[:2] >= (3, 10): + from typing import TypeGuard +else: + from typing_extensions import TypeGuard # PEP 647 + from .._codecs import _pdfdoc_encoding_rev from .._protocols import PdfObjectProtocol, PdfWriterProtocol from .._utils import ( @@ -214,7 +220,7 @@ def __repr__(self) -> str: return "NullObject" -def is_null_or_none(x: Any) -> bool: +def is_null_or_none(x: Any) -> TypeGuard[None]: """ Returns: True if x is None or NullObject. 
From 7a4409f540b07050380398dcd03747d275925ad2 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 15 Sep 2024 20:41:30 +0200 Subject: [PATCH 38/42] Undo is_null_or_none --- pypdf/_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index c0564f4c2..f9d23fd87 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -302,7 +302,7 @@ def _get_page_number_by_indirect( self._page_id2num = { x.indirect_reference.idnum: i # type: ignore for i, x in enumerate(self.pages) - if is_null_or_none(x.indirect_reference) + if x.indirect_reference is None } if is_null_or_none(indirect_reference): From dd68fa1c9a9b64db7c6125f26adcc495303bfb79 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 15 Sep 2024 21:18:45 +0200 Subject: [PATCH 39/42] Undo --- pypdf/_reader.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index f9d23fd87..4b40ce0e7 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -300,9 +300,7 @@ def _get_page_number_by_indirect( """ if self._page_id2num is None: self._page_id2num = { - x.indirect_reference.idnum: i # type: ignore - for i, x in enumerate(self.pages) - if x.indirect_reference is None + x.indirect_reference.idnum: i for i, x in enumerate(self.pages) # type: ignore } if is_null_or_none(indirect_reference): From 7510d5404115ed2b75597f70fd5e57fb6969feff Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 15 Sep 2024 21:19:56 +0200 Subject: [PATCH 40/42] Undo --- pypdf/_reader.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 4b40ce0e7..def0fc7f1 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -306,12 +306,13 @@ def _get_page_number_by_indirect( if is_null_or_none(indirect_reference): return None assert isinstance(indirect_reference, (int, IndirectObject)), "mypy" - idnum = ( - indirect_reference - if isinstance(indirect_reference, int) - else 
indirect_reference.idnum - ) - return self._page_id2num.get(idnum) + if isinstance(indirect_reference, int): + idnum = indirect_reference + else: + idnum = indirect_reference.idnum + assert self._page_id2num is not None, "hint for mypy" + ret = self._page_id2num.get(idnum, None) + return ret def _get_object_from_stream( self, indirect_reference: IndirectObject From 637bc44dd2f00803bd888c9dfb398f9794e3ff18 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:49:30 +0200 Subject: [PATCH 41/42] REL: 5.0.0 (#2851) ## Version 5.0.0, 2024-09-15 This version drops support for Python 3.7 (not maintained since July 2023), PdfMerger (use PdfWriter instead) and AnnotationBuilder (use annotations instead). ### Deprecations (DEP) - Remove the deprecated PfdMerger and AnnotationBuilder classes and other deprecations cleanup (#2813) - Drop Python 3.7 support (#2793) ### New Features (ENH) - Add capability to remove /Info from PDF (#2820) - Add incremental capability to PdfWriter (#2811) - Add UniGB-UTF16 encodings (#2819) - Accept utf strings for metadata (#2802) - Report PdfReadError instead of RecursionError (#2800) - Compress PDF files merging identical objects (#2795) ### Bug Fixes (BUG) - Fix sheared image (#2801) ### Robustness (ROB) - Robustify .set_data() (#2821) - Raise PdfReadError when missing /Root in trailer (#2808) - Fix extract_text() issues on damaged PDFs (#2760) - Handle images with empty data when processing an image from bytes (#2786) ### Developer Experience (DEV) - Fix coverage uploads (#2832) - Test against Python 3.13 (#2776) [Full Changelog](https://github.com/py-pdf/pypdf/compare/4.3.1...5.0.0) --- CHANGELOG.md | 33 +++++++++++++++++++++++++++++++++ pypdf/_version.py | 2 +- pyproject.toml | 2 +- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c07dca099..496b9954b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # CHANGELOG +## Version 
5.0.0, 2024-09-15 + +This version drops support for Python 3.7 (not maintained since July 2023), PdfMerger (use PdfWriter instead) and AnnotationBuilder (use annotations instead). + + +### Deprecations (DEP) +- Remove the deprecated PfdMerger and AnnotationBuilder classes and other deprecations cleanup (#2813) +- Drop Python 3.7 support (#2793) + +### New Features (ENH) +- Add capability to remove /Info from PDF (#2820) +- Add incremental capability to PdfWriter (#2811) +- Add UniGB-UTF16 encodings (#2819) +- Accept utf strings for metadata (#2802) +- Report PdfReadError instead of RecursionError (#2800) +- Compress PDF files merging identical objects (#2795) + +### Bug Fixes (BUG) +- Fix sheared image (#2801) + +### Robustness (ROB) +- Robustify .set_data() (#2821) +- Raise PdfReadError when missing /Root in trailer (#2808) +- Fix extract_text() issues on damaged PDFs (#2760) +- Handle images with empty data when processing an image from bytes (#2786) + +### Developer Experience (DEV) +- Fix coverage uploads (#2832) +- Test against Python 3.13 (#2776) + + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/4.3.1...5.0.0) + ## Version 4.3.1, 2024-07-21 ### Bug Fixes (BUG) diff --git a/pypdf/_version.py b/pypdf/_version.py index ed48cdab0..ba7be38e4 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "4.3.1" +__version__ = "5.0.0" diff --git a/pyproject.toml b/pyproject.toml index eb9e8a0a8..3da378ce9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,12 +17,12 @@ classifiers = [ "License :: OSI Approved :: BSD License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Operating System :: OS 
Independent", "Topic :: Software Development :: Libraries :: Python Modules", "Typing :: Typed", From c00ec60318e9cfd280a6d383c7a97a222c4a254d Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Tue, 17 Sep 2024 19:44:14 +0100 Subject: [PATCH 42/42] DOC: Tiny changes (#2844) --- docs/dev/documentation.md | 2 +- pypdf/_page_labels.py | 2 +- pypdf/_writer.py | 21 ++++++++++----------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/dev/documentation.md b/docs/dev/documentation.md index f23ab0bcc..70cf81297 100644 --- a/docs/dev/documentation.md +++ b/docs/dev/documentation.md @@ -53,4 +53,4 @@ The title of the PR will be used as the first line of that combined commit messa The first comment within the commit will be used as the message body. -See [dev intro](intro.html#commit-messages) for more details. +See [developer intro](intro.html#commit-messages) for more details. diff --git a/pypdf/_page_labels.py b/pypdf/_page_labels.py index 1bedc003a..0dce8ec75 100644 --- a/pypdf/_page_labels.py +++ b/pypdf/_page_labels.py @@ -125,7 +125,7 @@ def number2lowercase_letter(number: int) -> str: def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str: # [Nums] shall be an array of the form - # [ key 1 value 1 key 2 value 2 ... key n value n ] + # [ key_1 value_1 key_2 value_2 ... key_n value_n ] # where each key_i is an integer and the corresponding # value_i shall be the object associated with that key. # The keys shall be sorted in numerical order, diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 4d4cca329..308c2e9c8 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -3132,16 +3132,16 @@ def set_page_label( Page indexes must be given starting from 0. Labels must have a style, a prefix or both. - If to a range is not assigned any page label a decimal label starting from 1 is applied. + If a range is not assigned any page label a decimal label starting from 1 is applied. 
Args: page_index_from: page index of the beginning of the range starting from 0 page_index_to: page index of the beginning of the range starting from 0 style: The numbering style to be used for the numeric portion of each page label: - * ``/D`` Decimal arabic numerals - * ``/R`` Uppercase roman numerals - * ``/r`` Lowercase roman numerals + * ``/D`` Decimal Arabic numerals + * ``/R`` Uppercase Roman numerals + * ``/r`` Lowercase Roman numerals * ``/A`` Uppercase letters (A to Z for the first 26 pages, AA to ZZ for the next 26, and so on) * ``/a`` Lowercase letters (a to z for the first 26 pages, @@ -3180,18 +3180,17 @@ def _set_page_label( """ Set a page label to a range of pages. - Page indexes must be given - starting from 0. Labels must have a style, a prefix or both. If to a - range is not assigned any page label a decimal label starting from 1 is - applied. + Page indexes must be given starting from 0. + Labels must have a style, a prefix or both. + If a range is not assigned any page label a decimal label starting from 1 is applied. Args: page_index_from: page index of the beginning of the range starting from 0 page_index_to: page index of the beginning of the range starting from 0 style: The numbering style to be used for the numeric portion of each page label: - /D Decimal arabic numerals - /R Uppercase roman numerals - /r Lowercase roman numerals + /D Decimal Arabic numerals + /R Uppercase Roman numerals + /r Lowercase Roman numerals /A Uppercase letters (A to Z for the first 26 pages, AA to ZZ for the next 26, and so on) /a Lowercase letters (a to z for the first 26 pages,