Skip to content

Commit

Permalink
drop lines and spans from the provider if we detect bad text [skip ci]
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Nov 15, 2024
1 parent 983d00f commit c564341
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 10 deletions.
4 changes: 2 additions & 2 deletions marker/schema/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from marker.schema.bbox import BboxElement
from marker.schema.block import Block, Span
from surya.schema import TextDetectionResult, LayoutResult, OrderResult
from surya.schema import TextDetectionResult, LayoutResult


class Page(BboxElement):
Expand All @@ -12,7 +12,7 @@ class Page(BboxElement):
rotation: Optional[int] = None # Rotation degrees of the page
text_lines: Optional[TextDetectionResult] = None
layout: Optional[LayoutResult] = None
order: Optional[OrderResult] = None
order: Optional[Any] = None
ocr_method: Optional[str] = None # One of "surya" or "tesseract"
char_blocks: Optional[List[Dict]] = None # Blocks with character-level data from pdftext
images: Optional[List[Any]] = None # Images to save along with the page, need Any to avoid pydantic error
Expand Down
9 changes: 4 additions & 5 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from marker.v2.providers.pdf import PdfProvider

import tempfile
from typing import List, Optional

Expand All @@ -10,7 +12,6 @@
from marker.v2.converters import BaseConverter
from marker.v2.processors.equation import EquationProcessor
from marker.v2.processors.table import TableProcessor
from marker.v2.providers.pdf import PdfProvider
from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \
setup_detection_model

Expand All @@ -36,8 +37,8 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
equation_processor(document)

# TODO: re-enable once we add OCR method
#table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
#table_processor(document)
# table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
# table_processor(document)

rendered = document.render()
return rendered
Expand All @@ -55,5 +56,3 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
rendered = converter(temp_pdf.name)

print(rendered)


21 changes: 18 additions & 3 deletions marker/v2/providers/pdf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import functools
from typing import Dict, List, Tuple
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple

import pypdfium2 as pdfium
from pdftext.extraction import dictionary_output
from PIL import Image
from pydantic import BaseModel

from marker.ocr.heuristics import detect_bad_ocr
from marker.v2.providers import BaseProvider
from marker.v2.schema.polygon import PolygonBox
from marker.v2.schema.text.line import Line, Span
Expand Down Expand Up @@ -110,7 +110,22 @@ def setup(self):
)
)
line_spans.append(spans)
self.page_lines[page_id] = (lines, line_spans)
if self.check_line_spans(line_spans):
self.page_lines[page_id] = (lines, line_spans)

def check_line_spans(self, line_spans_list: List[List[Span]]) -> bool:
    """Decide whether the text extracted for a page is usable.

    The page text is rebuilt from the spans: each span's text is preceded
    by a single space and each line is terminated by a newline. The page
    is rejected when that text is blank or when ``detect_bad_ocr`` flags it.

    Args:
        line_spans_list: One list of spans per line of the page.

    Returns:
        ``False`` when there are no spans, when the rebuilt text contains
        only whitespace, or when ``detect_bad_ocr`` returns a truthy value
        for it; ``True`` otherwise.
    """
    # Build the text with a single join rather than growing a string span
    # by span. Layout: a space before every span, a newline after every line.
    text = "".join(
        "".join(" " + span.text for span in line_spans) + "\n"
        for line_spans in line_spans_list
    )
    # A page without any spans produces nothing but newlines, so this one
    # check covers both "no spans" and "only whitespace" without having to
    # flatten the nested lists first.
    if not text.strip():
        return False
    return not detect_bad_ocr(text)

@ functools.lru_cache(maxsize=None)
def get_image(self, idx: int, dpi: int) -> Image.Image:
Expand Down

0 comments on commit c564341

Please sign in to comment.