CodSpeedHQ · coco-speed · Jul 28, 2024 · Jul 31, 2024 · Aug 2, 2024 · Aug 2, 2024
diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
@@ -57,12 +57,12 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"]
         use-crypto-lib: ["cryptography"]
         include:
-          - python-version: "3.7"
+          - python-version: "3.8"
             use-crypto-lib: "pycryptodome"
-          - python-version: "3.7"
+          - python-version: "3.8"
             use-crypto-lib: "none"
     steps:
     - name: Update APT packages
@@ -83,14 +83,14 @@ jobs:
         key: cache-downloaded-files
     - name: Setup Python
       uses: actions/setup-python@v5
-      if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
+      if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
       with:
         python-version: ${{ matrix.python-version }}
         cache: 'pip'
         cache-dependency-path: '**/requirements/ci.txt'
     - name: Setup Python (3.11+)
       uses: actions/setup-python@v5
-      if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+      if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
       with:
         python-version: ${{ matrix.python-version }}
         allow-prereleases: true
@@ -102,11 +102,11 @@ jobs:
     - name: Install requirements (Python 3)
       run: |
         pip install -r requirements/ci.txt
-      if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
+      if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
     - name: Install requirements (Python 3.11+)
       run: |
         pip install -r requirements/ci-3.11.txt
-      if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+      if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
     - name: Remove pycryptodome and cryptography
       run: |
         pip uninstall pycryptodome cryptography -y
@@ -135,6 +135,7 @@ jobs:
         name: coverage-data.${{ matrix.python-version }}-${{ matrix.use-crypto-lib }}
         path: .coverage.*
         if-no-files-found: ignore
+        include-hidden-files: true
 
   codestyle:
     name: Check code style issues

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -12,6 +12,9 @@ on:
 permissions:
   contents: write
 
+env:
+  HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
+
 jobs:
   build_and_publish:
     name: Publish a new version
@@ -24,15 +27,15 @@ jobs:
       - name: Extract version from commit message
         id: extract_version
         run: |
-          VERSION=$(echo "${{ github.event.head_commit.message }}" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
+          VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
           echo "version=$VERSION" >> $GITHUB_OUTPUT
 
       - name: Extract tag message from commit message
         id: extract_message
         run: |
           VERSION="${{ steps.extract_version.outputs.version }}"
           delimiter="$(openssl rand -hex 8)"
-          MESSAGE=$(echo "${{ github.event.head_commit.message }}" | sed "0,/REL: $VERSION/s///" )
+          MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" )
           echo "message<<${delimiter}" >> $GITHUB_OUTPUT
           echo "$MESSAGE" >> $GITHUB_OUTPUT
           echo "${delimiter}" >> $GITHUB_OUTPUT

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,38 @@
 # CHANGELOG
 
+## Version 5.0.0, 2024-09-15
+
+This version drops support for Python 3.7 (not maintained since July 2023), PdfMerger (use PdfWriter instead) and AnnotationBuilder (use annotations instead).
+
+
+### Deprecations (DEP)
+- Remove the deprecated PdfMerger and AnnotationBuilder classes and other deprecations cleanup (#2813)
+- Drop Python 3.7 support (#2793)
+
+### New Features (ENH)
+- Add capability to remove /Info from PDF (#2820)
+- Add incremental capability to PdfWriter (#2811)
+- Add UniGB-UTF16 encodings (#2819)
+- Accept utf strings for metadata (#2802)
+- Report PdfReadError instead of RecursionError (#2800)
+- Compress PDF files merging identical objects (#2795)
+
+### Bug Fixes (BUG)
+- Fix sheared image (#2801)
+
+### Robustness (ROB)
+- Robustify .set_data() (#2821)
+- Raise PdfReadError when missing /Root in trailer (#2808)
+- Fix extract_text() issues on damaged PDFs (#2760)
+- Handle images with empty data when processing an image from bytes (#2786)
+
+### Developer Experience (DEV)
+- Fix coverage uploads (#2832)
+- Test against Python 3.13 (#2776)
+
+
+[Full Changelog](https://github.com/py-pdf/pypdf/compare/4.3.1...5.0.0)
+
 ## Version 4.3.1, 2024-07-21
 
 ### Bug Fixes (BUG)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -19,6 +19,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr
 * [ediamondscience](https://github.com/ediamondscience)
 * [Ermeson, Felipe](https://github.com/FelipeErmeson)
 * [Freitag, François](https://github.com/francoisfreitag)
+* [Gagnon, William G.](https://github.com/williamgagnon)
 * [Górny, Michał](https://github.com/mgorny)
 * [Grillo, Miguel](https://github.com/Ineffable22)
 * [Gutteridge, David H.](https://github.com/dhgutteridge)

diff --git a/docs/dev/documentation.md b/docs/dev/documentation.md
@@ -53,4 +53,4 @@ The title of the PR will be used as the first line of that combined commit messa
 
 The first comment within the commit will be used as the message body.
 
-See [dev intro](intro.html#commit-messages)  for more details.
+See [developer intro](intro.html#commit-messages) for more details.
diff --git a/docs/modules/PageObject.rst b/docs/modules/PageObject.rst
@@ -6,14 +6,12 @@ The PageObject Class
     :undoc-members:
     :show-inheritance:
 
-.. autoclass:: pypdf._utils.ImageFile
+.. autoclass:: pypdf._page.VirtualListImages
     :members:
     :undoc-members:
     :show-inheritance:
-    :exclude-members: IndirectObject
 
-.. autoclass:: pypdf._utils.File
+.. autoclass:: pypdf._page.ImageFile
     :members:
+    :inherited-members: File
     :undoc-members:
-    :show-inheritance:
-    :exclude-members: IndirectObject
diff --git a/docs/user/file-size.md b/docs/user/file-size.md
@@ -9,23 +9,17 @@ Some PDF documents contain the same object multiple times. For example, if an
 image appears three times in a PDF it could be embedded three times. Or it can
 be embedded once and referenced twice.
 
-This can be done by reading and writing the file:
+When adding data to a PdfWriter, the data is copied while respecting the original format.
+For example, if two pages include the same image which is duplicated in the source document, the object will be duplicated in the PdfWriter object.
 
-```python
-from pypdf import PdfReader, PdfWriter
-
-reader = PdfReader("big-old-file.pdf")
-writer = PdfWriter()
+Additionally, when you delete objects in a document, pypdf cannot easily determine whether those objects are still used elsewhere or whether the user wants to keep them. When the PDF file is written, such objects remain hidden within it (they are part of the file, but not displayed).
 
-for page in reader.pages:
-    writer.add_page(page)
+In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)`
 
-if reader.metadata is not None:
-    writer.add_metadata(reader.metadata)
+* `remove_identicals` enables/disables the merging of identical objects.
+* `remove_orphans` enables/disables the removal of unused objects.
 
-with open("smaller-new-file.pdf", "wb") as fp:
-    writer.write(fp)
-```
+It is recommended to apply this process just before writing to the file/stream.
 
 It depends on the PDF how well this works, but we have seen an 86% file
 reduction (from 5.7 MB to 0.8 MB) within a real PDF.

diff --git a/docs/user/metadata.md b/docs/user/metadata.md
@@ -76,6 +76,30 @@ writer.add_metadata(
     }
 )
 
+# Clear all data but keep the /Info entry in the PDF
+writer.metadata = {}
+
+# Replace all entries with new set of entries
+writer.metadata = {
+    "/Author": "Martin",
+    "/Producer": "Libre Writer",
+}
+
+# Save the new PDF to a file
+with open("meta-pdf.pdf", "wb") as f:
+    writer.write(f)
+```
+
+## Removing the metadata entry
+
+```python
+from pypdf import PdfWriter
+
+writer = PdfWriter("example.pdf")
+
+# Remove Metadata (/Info entry)
+writer.metadata = None
+
 # Save the new PDF to a file
 with open("meta-pdf.pdf", "wb") as f:
     writer.write(f)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -3,13 +3,12 @@
 from typing import Any, Dict, List, Tuple, Union, cast
 
 from ._codecs import adobe_glyphs, charset_encoding
-from ._utils import b_, logger_error, logger_warning
+from ._utils import logger_error, logger_warning
 from .generic import (
     DecodedStreamObject,
     DictionaryObject,
-    IndirectObject,
-    NullObject,
     StreamObject,
+    is_null_or_none,
 )
 
 
@@ -127,6 +126,8 @@ def build_char_map_from_dict(
     "/ETenms-B5-V": "cp950",
     "/UniCNS-UTF16-H": "utf-16-be",
     "/UniCNS-UTF16-V": "utf-16-be",
+    "/UniGB-UTF16-H": "gb18030",
+    "/UniGB-UTF16-V": "gb18030",
     # UCS2 in code
 }
 
@@ -258,8 +259,8 @@ def prepare_cm(ft: DictionaryObject) -> bytes:
     tu = ft["/ToUnicode"]
     cm: bytes
     if isinstance(tu, StreamObject):
-        cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data())
-    elif isinstance(tu, str) and tu.startswith("/Identity"):
+        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
+    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
         # the full range 0000-FFFF will be processed
         cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
     if isinstance(cm, str):
@@ -448,34 +449,27 @@ def compute_space_width(
             en: int = cast(int, ft["/LastChar"])
             if st > space_code or en < space_code:
                 raise Exception("Not in range")
-            if w[space_code - st] == 0:
+            if w[space_code - st].get_object() == 0:
                 raise Exception("null width")
-            sp_width = w[space_code - st]
+            sp_width = w[space_code - st].get_object()
         except Exception:
             if "/FontDescriptor" in ft and "/MissingWidth" in cast(
                 DictionaryObject, ft["/FontDescriptor"]
             ):
-                sp_width = ft["/FontDescriptor"]["/MissingWidth"]  # type: ignore
+                sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
             else:
                 # will consider width of char as avg(width)/2
                 m = 0
                 cpt = 0
-                for x in w:
-                    if x > 0:
-                        m += x
+                for xx in w:
+                    xx = xx.get_object()
+                    if xx > 0:
+                        m += xx
                         cpt += 1
                 sp_width = m / max(1, cpt) / 2
 
-    if isinstance(sp_width, IndirectObject):
-        # According to
-        # 'Table 122 - Entries common to all font descriptors (continued)'
-        # the MissingWidth should be a number, but according to #2286 it can
-        # be an indirect object
-        obj = sp_width.get_object()
-        if obj is None or isinstance(obj, NullObject):
-            return 0.0
-        return obj  # type: ignore
-
+    if is_null_or_none(sp_width):
+        sp_width = 0.0
     return sp_width
 
 
@@ -488,8 +482,9 @@ def type1_alternative(
     if "/FontDescriptor" not in ft:
         return map_dict, space_code, int_entry
     ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
-    if ft_desc is None:
+    if is_null_or_none(ft_desc):
         return map_dict, space_code, int_entry
+    assert ft_desc is not None, "mypy"
     txt = ft_desc.get_object().get_data()
     txt = txt.split(b"eexec\n")[0]  # only clear part
     txt = txt.split(b"/Encoding")[1]  # to get the encoding part
Original file line number	Diff line number	Diff line change
Expand Up		@@ -53,4 +53,4 @@ The title of the PR will be used as the first line of that combined commit messa

		The first comment within the commit will be used as the message body.

		See [dev intro](intro.html#commit-messages) for more details.
		See [developer intro](intro.html#commit-messages) for more details.