Skip to content

Commit

Permalink
WIP feat: accept binary PDF data in addition to a path to a PDF
Browse files Browse the repository at this point in the history
The library currently expects a file path to be given to the
PdfConverter, but the underlying pdfium.PdfDocument constructor
also supports bytes and other input types. Adding 'bytes' as an
accepted type for the 'filepath' field of the Document Pydantic
model, and for the provider constructors, allows in-memory
(binary) PDFs to be parsed by the library.
  • Loading branch information
aguadoenzo committed Dec 5, 2024
1 parent 1b95cc9 commit 901419a
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 5 deletions.
3 changes: 2 additions & 1 deletion marker/processors/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ class DebugProcessor(BaseProcessor):

def __call__(self, document: Document):
# Remove extension from doc name
doc_base = os.path.basename(document.filepath).rsplit(".", 1)[0]
filepath = document.filepath if isinstance(document.filepath, str) else ""
doc_base = os.path.basename(filepath).rsplit(".", 1)[0]
self.debug_folder = os.path.join(self.debug_data_folder, doc_base)
if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
os.makedirs(self.debug_folder, exist_ok=True)
Expand Down
2 changes: 1 addition & 1 deletion marker/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def raw_text(self):
ProviderPageLines = Dict[int, List[ProviderOutput]]

class BaseProvider:
def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None):
def __init__(self, filepath: str | bytes, config: Optional[BaseModel | dict] = None):
assign_config(self, config)
self.filepath = filepath

Expand Down
4 changes: 2 additions & 2 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ class PdfProvider(BaseProvider):
ocr_newline_threshold: float = .6
ocr_alphanum_threshold: float = .3

def __init__(self, filepath: str, config=None):
super().__init__(filepath, config)
def __init__(self, file: str | bytes, config=None):
super().__init__(file, config)

self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))}
Expand Down
2 changes: 1 addition & 1 deletion marker/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class TocItem(BaseModel):


class Document(BaseModel):
filepath: str
filepath: str | bytes
pages: List[PageGroup]
block_type: BlockTypes = BlockTypes.Document
table_of_contents: List[TocItem] | None = None
Expand Down

0 comments on commit 901419a

Please sign in to comment.