Skip to content

Commit

Permalink
ROB: Deal with content streams not containing streams (#3005)
Browse files Browse the repository at this point in the history
Closes #2995.
  • Loading branch information
stefan6419846 authored Jan 24, 2025
1 parent 4dc3e90 commit 1401bcf
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 12 deletions.
19 changes: 10 additions & 9 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2101,15 +2101,16 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
elif operator == b"TJ":
# The space width may be smaller than the font width, so the width should be 95%.
_confirm_space_width = _space_width * 0.95
for op in operands[0]:
if isinstance(op, (str, bytes)):
process_operation(b"Tj", [op])
if isinstance(op, (int, float, NumberObject, FloatObject)) and (
(abs(float(op)) >= _confirm_space_width)
and (len(text) > 0)
and (text[-1] != " ")
):
process_operation(b"Tj", [" "])
if operands:
for op in operands[0]:
if isinstance(op, (str, bytes)):
process_operation(b"Tj", [op])
if isinstance(op, (int, float, NumberObject, FloatObject)) and (
(abs(float(op)) >= _confirm_space_width)
and (len(text) > 0)
and (text[-1] != " ")
):
process_operation(b"Tj", [" "])
elif operator == b"Do":
output += text
if visitor_text is not None:
Expand Down
13 changes: 12 additions & 1 deletion pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -1178,7 +1178,18 @@ def __init__(
if isinstance(stream, ArrayObject):
data = b""
for s in stream:
data += s.get_object().get_data()
s_resolved = s.get_object()
if isinstance(s_resolved, NullObject):
continue
if not isinstance(s_resolved, StreamObject):
# No need to emit an exception here for now - the PDF structure
# seems to already be broken beforehand in these cases.
logger_warning(
f"Expected StreamObject, got {type(s_resolved).__name__} instead. Data might be wrong.",
__name__
)
else:
data += s_resolved.get_data()
if len(data) == 0 or data[-1] != b"\n":
data += b"\n"
super().set_data(bytes(data))
Expand Down
10 changes: 10 additions & 0 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1209,3 +1209,13 @@ def test_coverage_streamobject():
co[NameObject("/testkey")] = NameObject("/test")
co.decoded_self = DecodedStreamObject()
assert "/testkey" in co.replicate(writer)


def test_contentstream_arrayobject_containing_nullobject(caplog):
    """Check that a NullObject inside a content-stream array is ignored.

    The resulting content stream must contain only the data of the real
    stream (plus the trailing newline), and nothing may be logged.
    """
    stream_object = DecodedStreamObject()
    stream_object.set_data(b"Hello World!")

    # The array mixes a NullObject with one real stream; only the latter
    # is expected to contribute to the merged data.
    input_stream = ArrayObject([NullObject(), stream_object])
    content_stream = ContentStream(stream=input_stream, pdf=None)
    assert content_stream.get_data() == b"Hello World!\n"
    assert caplog.text == ""
29 changes: 27 additions & 2 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from pypdf import PdfReader, mult
from pypdf._text_extraction import set_custom_rtl
from pypdf.errors import ParseError
from pypdf.errors import ParseError, PdfReadError

from . import get_data_from_url

Expand Down Expand Up @@ -165,7 +165,7 @@ def test_layout_mode_indirect_sequence_font_widths():
# Cover the situation where the sequence for font widths is an IndirectObject
# ref https://github.com/py-pdf/pypdf/pull/2788
url = "https://github.com/user-attachments/files/16491621/2788_example.pdf"
name ="2788_example.pdf"
name = "2788_example.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
assert reader.pages[0].extract_text(extraction_mode="layout") == ""
url = "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf"
Expand All @@ -175,9 +175,11 @@ def test_layout_mode_indirect_sequence_font_widths():
reader.pages[0].extract_text(extraction_mode="layout")
assert str(exc.value).startswith("Invalid font width definition")


def dummy_visitor_text(text, ctm, tm, fd, fs):
    """No-op callback: accept the five arguments and do nothing.

    Always returns ``None``. Presumably used as a ``visitor_text`` stand-in
    by the tests below - confirm against the callers.
    """


@patch("pypdf._page.logger_warning")
def test_layout_mode_warnings(mock_logger_warning):
# Check that a warning is issued when an argument is ignored
Expand Down Expand Up @@ -274,6 +276,29 @@ def test_infinite_loop_arrays():
assert "RNA structure comparison" in extracted


@pytest.mark.enable_socket
def test_content_stream_is_dictionary_object(caplog):
    """Tests for #2995.

    The sample file makes the content-stream handling encounter a
    DictionaryObject where a StreamObject is expected. By default this
    must only log a warning while text extraction still works; with
    ``strict=True`` extraction must raise ``PdfReadError``.
    """
    url = "https://github.com/user-attachments/files/18049322/6fa5fd46-5f98-4a67-800d-5e2362b0164f.pdf"
    name = "iss2995.pdf"
    data = get_data_from_url(url, name=name)

    # Default reader: text is extracted and the warning is logged.
    reader = PdfReader(BytesIO(data))
    page = reader.pages[0]
    assert "\nYours faithfully \n" in page.extract_text()
    assert "Expected StreamObject, got DictionaryObject instead. Data might be wrong." in caplog.text
    caplog.clear()

    # Strict mode: the same page fails with a read error.
    reader = PdfReader(BytesIO(data), strict=True)
    page = reader.pages[0]
    with pytest.raises(PdfReadError) as exception:
        page.extract_text()
    assert (
        "Invalid Elementary Object starting with b\\'\\\\x18\\' @3557: b\\'ateDecode/Length 629\\\\x18ck["
        in exception.value.args[0]
    )


@pytest.mark.enable_socket
def test_tz_with_no_operands():
"""Tests for #2975"""
Expand Down

0 comments on commit 1401bcf

Please sign in to comment.