diff --git a/marker/processors/debug.py b/marker/processors/debug.py index 3d46b046..1c6d5df8 100644 --- a/marker/processors/debug.py +++ b/marker/processors/debug.py @@ -49,7 +49,8 @@ class DebugProcessor(BaseProcessor): def __call__(self, document: Document): # Remove extension from doc name - doc_base = os.path.basename(document.filepath).rsplit(".", 1)[0] + filepath = document.filepath if isinstance(document.filepath, str) else "" + doc_base = os.path.basename(filepath).rsplit(".", 1)[0] self.debug_folder = os.path.join(self.debug_data_folder, doc_base) if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]): os.makedirs(self.debug_folder, exist_ok=True) diff --git a/marker/providers/__init__.py b/marker/providers/__init__.py index 6b389065..3d51f8d0 100644 --- a/marker/providers/__init__.py +++ b/marker/providers/__init__.py @@ -19,7 +19,7 @@ def raw_text(self): ProviderPageLines = Dict[int, List[ProviderOutput]] class BaseProvider: - def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None): + def __init__(self, filepath: str | bytes, config: Optional[BaseModel | dict] = None): assign_config(self, config) self.filepath = filepath diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 64a8d9a6..be38bfa5 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -26,8 +26,8 @@ class PdfProvider(BaseProvider): ocr_newline_threshold: float = .6 ocr_alphanum_threshold: float = .3 - def __init__(self, filepath: str, config=None): - super().__init__(filepath, config) + def __init__(self, file: str | bytes, config=None): + super().__init__(file, config) self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath) self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))} diff --git a/marker/schema/document.py b/marker/schema/document.py index d7ca4c73..67dad682 100644 --- a/marker/schema/document.py +++ b/marker/schema/document.py @@ -23,7 +23,7 @@ class TocItem(BaseModel): class Document(BaseModel): - filepath: str + filepath: str | bytes pages: List[PageGroup] block_type: BlockTypes = BlockTypes.Document table_of_contents: List[TocItem] | None = None