Skip to content

Commit

Permalink
feat: accept bytes as input (for async applications) (#65)
Browse files Browse the repository at this point in the history
* feat: accept `bytes` as input (for async applications)

* feat: add `playa.parse` to read from bytes buffer

* docs: add parse

* feat: just do not call read ever

* docs: clarify Document constructor behaviour

* docs: update more docs

* chore: mypy
  • Loading branch information
dhdaines authored Feb 21, 2025
1 parent 22d94ff commit ee37020
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 94 deletions.
1 change: 1 addition & 0 deletions docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
options:
members:
- open
- parse

::: playa.document
options:
Expand Down
43 changes: 42 additions & 1 deletion playa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from multiprocessing.context import BaseContext
from typing import Union

from playa.worker import _init_worker
from playa.worker import _init_worker, _init_worker_buffer
from playa.color import Color, ColorSpace
from playa.document import Document, PageList
from playa.exceptions import PDFException
Expand Down Expand Up @@ -95,3 +95,44 @@ def open(
initargs=(id(pdf), path, password, space), # type: ignore[arg-type]
)
return pdf


def parse(
    buffer: bytes,
    *,
    password: str = "",
    space: DeviceSpace = "screen",
    max_workers: Union[int, None] = 1,
    mp_context: Union[BaseContext, None] = None,
) -> Document:
    """Parse a PDF document held entirely in memory.

    Note: Potential slowness
        When more than one worker process is requested, the whole
        buffer is currently passed to every worker's initializer,
        which may cause some overhead.  Prefer `open` on a
        filesystem path when possible, since that uses memory-mapped
        I/O.

    Args:
        buffer: Bytes containing the PDF data.
        password: Password for decryption, if needed.
        space: Device space to use ("screen" for screen-like
            coordinates, "page" for pdfminer.six-like coordinates,
            "default" for default user space with no rotation or
            translation)
        max_workers: Number of worker processes to use for parallel
            processing of pages (if 1, no workers are spawned; if
            `None`, a pool is created and its size is left to
            `ProcessPoolExecutor`)
        mp_context: Multiprocessing context to use for worker
            processes, see [Contexts and Start
            Methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
            for more information.

    Returns:
        The `Document`, with a process pool attached when more than
        one worker was requested.
    """
    document = Document(buffer, password=password, space=space)
    # Single-process mode: nothing more to set up.
    if max_workers is not None and max_workers <= 1:
        return document
    # Each worker rebuilds its own Document from the same bytes; the
    # parent document's id() lets workers identify their "boss".
    document._pool = ProcessPoolExecutor(
        max_workers=max_workers,
        mp_context=mp_context,
        initializer=_init_worker_buffer,  # type: ignore[arg-type]
        initargs=(id(document), buffer, password, space),  # type: ignore[arg-type]
    )
    return document
159 changes: 82 additions & 77 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,76 +97,63 @@
INHERITABLE_PAGE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}


def read_header(fp: BinaryIO) -> Tuple[str, int]:
"""Read the PDF header and return the (initial) version string and
its position.
Sets the file pointer to after the header (this is not reliable).
Note that this version can be overridden in the document catalog.
"""
try:
hdr = fp.read(8)
start = 0
except IOError as err:
raise PDFSyntaxError("Failed to read PDF header") from err
if not hdr.startswith(b"%PDF-"):
# Try harder... there might be some extra junk before it
fp.seek(0, 0)
hdr = fp.read(4096) # FIXME: this is arbitrary...
start = hdr.find(b"%PDF-")
if start == -1:
raise PDFSyntaxError("Could not find b'%%PDF-', is this a PDF?")
hdr = hdr[start : start + 8]
fp.seek(start + 8)
log.debug("Found header at position %d: %r", start, hdr)
def _find_header(buffer: Union[bytes, mmap.mmap]) -> Tuple[bytes, int]:
start = buffer.find(b"%PDF-")
if start == -1:
log.warning("Could not find b'%PDF-' header, is this a PDF?")
return b"", 0
return buffer[start : start + 8], start


def _open_input(fp: Union[BinaryIO, bytes]) -> Tuple[str, int, Union[bytes, mmap.mmap]]:
if isinstance(fp, bytes):
buffer: Union[bytes, mmap.mmap] = fp
else:
try:
buffer = mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)
except io.UnsupportedOperation:
log.warning("mmap not supported on %r, reading document into memory", fp)
buffer = fp.read()
except ValueError:
raise
hdr, offset = _find_header(buffer)
try:
version = hdr[5:].decode("ascii")
except UnicodeDecodeError as err:
raise PDFSyntaxError(
"Version number in %r contains non-ASCII characters" % hdr
) from err
except UnicodeDecodeError:
log.warning("Version number in header %r contains non-ASCII characters", hdr)
version = "1.0"
if not re.match(r"\d\.\d", version):
raise PDFSyntaxError("Version number in %r is invalid" % hdr)
return version, start


class OutlineItem(NamedTuple):
"""The most relevant fields of an outline item dictionary.
Danger: Deprecated
This interface is deprecated. It will be removed in PLAYA 1.0.
"""

level: int
title: str
dest: Union[PSLiteral, bytes, list, None]
action: Union[dict, None]
se: Union[ObjRef, None]
log.warning("Version number in header %r is invalid", hdr)
version = "1.0"
return version, offset, buffer


class Document:
"""Representation of a PDF document on disk.
"""Representation of a PDF document.
Since PDF documents can be very large and complex, merely creating
a `Document` does very little aside from opening the file and
verifying that the password is correct and it is, in fact, a PDF.
This may, however, involve a certain amount of file access since
the cross-reference table and trailer must be read in order to
determine this (we do not treat linearized PDFs specially for the
moment).
a `Document` does very little aside from verifying that the
password is correct and getting a minimal amount of metadata. In
general, PLAYA will try to open just about anything as a PDF, so
you should not expect the constructor to fail here if you give it
nonsense (something else may fail later on).
Some metadata, such as the structure tree and page tree, will be
loaded lazily and cached. We do not handle modification of PDFs.
Args:
fp: File-like object in binary mode. Will be read using
`mmap` if possible, otherwise will be read into memory.
fp: File-like object in binary mode, or a buffer with binary data.
Files will be read using `mmap` if possible. They do not need
to be seekable, as if `mmap` fails the entire file will simply
be read into memory (so a pipe or socket ought to work).
password: Password for decryption, if needed.
space: the device space to use for interpreting content ("screen"
or "page")
or "page")
Raises:
TypeError: if `fp` is a file opened in text mode (don't do that!)
PDFEncryptionError: if the PDF has an unsupported encryption scheme
PDFPasswordIncorrect: if the password is incorrect
"""

_fp: Union[BinaryIO, None] = None
Expand All @@ -189,7 +176,7 @@ def __exit__(self, exc_type, exc_value, traceback) -> None:

def __init__(
self,
fp: BinaryIO,
fp: Union[BinaryIO, bytes],
password: str = "",
space: DeviceSpace = "screen",
_boss_id: int = 0,
Expand All @@ -210,24 +197,7 @@ def __init__(
self._cached_fonts: Dict[object, Font] = {}
if isinstance(fp, io.TextIOBase):
raise TypeError("fp is not a binary file")
# The header is frequently mangled, in which case we will try to read the
# file anyway.
try:
self.pdf_version, self.offset = read_header(fp)
except PDFSyntaxError:
log.warning("PDF header not found, will try to read the file anyway")
self.pdf_version = "UNKNOWN"
self.offset = 0
try:
self.buffer: Union[bytes, mmap.mmap] = mmap.mmap(
fp.fileno(), 0, access=mmap.ACCESS_READ
)
except io.UnsupportedOperation:
log.warning("mmap not supported on %r, reading document into memory", fp)
fp.seek(0, 0)
self.buffer = fp.read()
except ValueError:
raise
self.pdf_version, self.offset, self.buffer = _open_input(fp)
self.is_printable = self.is_modifiable = self.is_extractable = True
# Getting the XRef table and trailer is done non-lazily
# because they contain encryption information among other
Expand Down Expand Up @@ -451,8 +421,9 @@ def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
def __getitem__(self, objid: int) -> PDFObject:
"""Get an indirect object from the PDF.
Note that the behaviour in the case of a non-existent object,
while Pythonic, is not PDFic, as PDF 1.7 sec 7.3.10 states:
Note that the behaviour in the case of a non-existent object
(raising `IndexError`), while Pythonic, is not PDFic, as PDF
1.7 sec 7.3.10 states:
> An indirect reference to an undefined object shall not be
considered an error by a conforming reader; it shall be
Expand All @@ -461,6 +432,7 @@ def __getitem__(self, objid: int) -> PDFObject:
Raises:
ValueError: if Document is not initialized
IndexError: if objid does not exist in PDF
"""
if not self.xrefs:
raise ValueError("Document is not initialized")
Expand Down Expand Up @@ -539,7 +511,7 @@ def outline(self) -> Union[Outline, None]:
return Outline(self)

@property
def outlines(self) -> Iterator[OutlineItem]:
def outlines(self) -> Iterator["OutlineItem"]:
"""Iterate over the PDF document outline.
Danger: Deprecated
Expand All @@ -552,7 +524,7 @@ def outlines(self) -> Iterator[OutlineItem]:
if "Outlines" not in self.catalog:
raise KeyError

def search(entry: object, level: int) -> Iterator[OutlineItem]:
def search(entry: object, level: int) -> Iterator["OutlineItem"]:
entry = dict_value(entry)
if "Title" in entry:
if "A" in entry or "Dest" in entry:
Expand Down Expand Up @@ -990,6 +962,15 @@ def __init__(self, doc: Document) -> None:
self.dests_tree = NameTree(names["Dests"])

def __iter__(self) -> Iterator[str]:
"""Iterate over named destinations.
Danger: Beware of corrupted PDFs
This simply iterates over the names listed in the PDF, and
does not attempt to actually parse the destinations
(because that's pretty slow). If the PDF is broken, you
may encounter exceptions when actually trying to access
them by name.
"""
if self.dests_dict is not None:
yield from self.dests_dict
elif self.dests_tree is not None:
Expand All @@ -998,6 +979,16 @@ def __iter__(self) -> Iterator[str]:
yield ks

def __getitem__(self, name: Union[bytes, str, PSLiteral]) -> Destination:
"""Get a named destination.
Args:
name: The name of the destination.
Raises:
KeyError: If no such destination exists.
TypeError: If the PDF is damaged and the destinations tree
contains something unexpected or missing.
"""
if isinstance(name, bytes):
name = decode_text(name)
elif isinstance(name, PSLiteral):
Expand Down Expand Up @@ -1028,3 +1019,17 @@ def __getitem__(self, name: Union[bytes, str, PSLiteral]) -> Destination:
def doc(self) -> "Document":
"""Get associated document if it exists."""
return _deref_document(self._docref)


class OutlineItem(NamedTuple):
    """The most relevant fields of an outline item dictionary.

    Danger: Deprecated
        This interface is deprecated. It will be removed in PLAYA 1.0.
    """

    # Level passed down by the recursive search in `Document.outlines`
    # (presumably the nesting depth of the item -- confirm there).
    level: int
    # Title of the outline item (presumably from its "Title" entry).
    title: str
    # Destination, if any (presumably the item's "Dest" entry).
    dest: Union[PSLiteral, bytes, list, None]
    # Action dictionary, if any (presumably the item's "A" entry).
    action: Union[dict, None]
    # Object reference, if any (presumably the "SE" structure element
    # entry -- confirm against `Document.outlines`).
    se: Union[ObjRef, None]
24 changes: 22 additions & 2 deletions playa/outline.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@

@dataclass
class Destination:
"""PDF destinations (PDF 1.7 sect 12.3.2)"""

_docref: DocumentRef
page_idx: Union[int, None]
display: Union[PSLiteral, None]
Expand Down Expand Up @@ -75,6 +77,8 @@ def from_list(cls, doc: "Document", dest: Sequence) -> "Destination":

@dataclass
class Action:
"""PDF actions (PDF 1.7 sect 12.6)"""

_docref: DocumentRef
props: Dict[str, PDFObject]

Expand All @@ -97,6 +101,8 @@ def destination(self) -> Union[Destination, None]:


class Outline:
"""PDF document outline (PDF 1.7 sect 12.3.3)"""

_docref: DocumentRef
props: Dict[str, PDFObject]

Expand Down Expand Up @@ -140,7 +146,17 @@ def title(self) -> Union[str, None]:

@property
def destination(self) -> Union[Destination, None]:
# Treat a GoTo action as equivalent to a destination
"""Destination for this outline item.
Note: Special case of `GoTo` actions.
Since internal `GoTo` actions (PDF 1.7 sect 12.6.4.2) in
outlines and links are entirely equivalent to
destinations, if one exists, it will be returned here as
well.
Returns:
destination, if one exists.
"""
dest = resolve1(self.props.get("Dest"))
if dest is not None:
return Destination.from_dest(self.doc, dest)
Expand All @@ -159,7 +175,11 @@ def action(self) -> Union[Action, None]:
@property
def element(self) -> Union[Element, None]:
"""The structure element associated with this outline item, if
any."""
any.
Returns:
structure element, if one exists.
"""
el = self.props.get("SE")
if el is None:
return None
Expand Down
10 changes: 10 additions & 0 deletions playa/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ def _init_worker(
GLOBAL_DOC = boss


def _init_worker_buffer(
    boss: int,
    buffer: bytes,
    password: str = "",
    space: "DeviceSpace" = "screen",
) -> None:
    """Initialize a worker process from an in-memory PDF buffer.

    Builds this process's own `Document` from `buffer` and records
    which parent document it belongs to.

    Args:
        boss: Identifier of the parent-process document (`playa.parse`
            passes `id(pdf)`); forwarded as `_boss_id`.
        buffer: Bytes containing the PDF data.
        password: Password for decryption, if needed.
        space: Device space to use for the worker's document.
    """
    # Deferred import, as in the original (presumably to avoid a
    # circular import between playa.worker and playa.document).
    from playa.document import Document

    global __pdf
    global GLOBAL_DOC
    worker_doc = Document(buffer, password=password, space=space, _boss_id=boss)
    __pdf = worker_doc
    GLOBAL_DOC = boss


def _set_document(doc: "Document", boss: int) -> None:
"""Call this in the worker process."""
global __pdf, GLOBAL_DOC
Expand Down
Loading

0 comments on commit ee37020

Please sign in to comment.