diff --git a/marker/schema/page.py b/marker/schema/page.py index bf2bc6f9..23e643bc 100644 --- a/marker/schema/page.py +++ b/marker/schema/page.py @@ -3,7 +3,7 @@ from marker.schema.bbox import BboxElement from marker.schema.block import Block, Span -from surya.schema import TextDetectionResult, LayoutResult, OrderResult +from surya.schema import TextDetectionResult, LayoutResult class Page(BboxElement): @@ -12,7 +12,7 @@ class Page(BboxElement): rotation: Optional[int] = None # Rotation degrees of the page text_lines: Optional[TextDetectionResult] = None layout: Optional[LayoutResult] = None - order: Optional[OrderResult] = None + order: Optional[Any] = None ocr_method: Optional[str] = None # One of "surya" or "tesseract" char_blocks: Optional[List[Dict]] = None # Blocks with character-level data from pdftext images: Optional[List[Any]] = None # Images to save along with the page, need Any to avoid pydantic error diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py index e4774c07..1b307b9c 100644 --- a/marker/v2/converters/pdf.py +++ b/marker/v2/converters/pdf.py @@ -1,3 +1,5 @@ +from marker.v2.providers.pdf import PdfProvider + import tempfile from typing import List, Optional @@ -10,7 +12,6 @@ from marker.v2.converters import BaseConverter from marker.v2.processors.equation import EquationProcessor from marker.v2.processors.table import TableProcessor -from marker.v2.providers.pdf import PdfProvider from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \ setup_detection_model @@ -36,8 +37,8 @@ def __call__(self, filepath: str, page_range: List[int] | None = None): equation_processor(document) # TODO: re-enable once we add OCR method - #table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model) - #table_processor(document) + # table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model) + # table_processor(document) rendered = document.render() return rendered @@ -55,5 +56,3 @@ def __call__(self, filepath: str, page_range: List[int] | None = None): rendered = converter(temp_pdf.name) print(rendered) - - diff --git a/marker/v2/providers/pdf.py b/marker/v2/providers/pdf.py index 4c9b391a..f6d1d07d 100644 --- a/marker/v2/providers/pdf.py +++ b/marker/v2/providers/pdf.py @@ -1,12 +1,12 @@ import functools -from typing import Dict, List, Tuple -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple import pypdfium2 as pdfium from pdftext.extraction import dictionary_output from PIL import Image from pydantic import BaseModel +from marker.ocr.heuristics import detect_bad_ocr from marker.v2.providers import BaseProvider from marker.v2.schema.polygon import PolygonBox from marker.v2.schema.text.line import Line, Span @@ -110,7 +110,22 @@ def setup(self): ) ) line_spans.append(spans) - self.page_lines[page_id] = (lines, line_spans) + if self.check_line_spans(line_spans): + self.page_lines[page_id] = (lines, line_spans) + + def check_line_spans(self, line_spans_list: List[List[Span]]) -> bool: + if not len(sum(line_spans_list, [])): + return False + text = "" + for line_spans in line_spans_list: + for span in line_spans: + text = text + " " + span.text + text = text + "\n" + if len(text.strip()) == 0: + return False + if detect_bad_ocr(text): + return False + return True @ functools.lru_cache(maxsize=None) def get_image(self, idx: int, dpi: int) -> Image.Image: