Skip to content

Commit

Permalink
Merge pull request #138 from VikParuchuri/dev
Browse files (browse the repository at this point in the history)
Faster text extraction
  • Loading branch information
VikParuchuri authored May 23, 2024
2 parents cc9d830 + 0281aea commit 0d9b0db
Show file tree
Hide file tree
Showing 6 changed files with 272 additions and 258 deletions.
1 change: 1 addition & 0 deletions marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def convert_single_pdf(
doc = pdfium.PdfDocument(fname)
pages, toc = get_text_blocks(
doc,
+ fname,
max_pages=max_pages,
)
out_meta.update({
Expand Down
8 changes: 6 additions & 2 deletions marker/ocr/recognition.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
+ import tempfile
from itertools import repeat
from typing import List, Optional, Dict

Expand Down Expand Up @@ -160,9 +161,12 @@ def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
tesseract_non_ocr_timeout=settings.TESSERACT_TIMEOUT,
)

- new_doc = pdfium.PdfDocument(out_pdf.getvalue())
+ with tempfile.NamedTemporaryFile() as f:
+     f.write(out_pdf.getvalue())
+     f.seek(0)
+     new_doc = pdfium.PdfDocument(f.name)
+     blocks, _ = get_text_blocks(new_doc, f.name, max_pages=1)

- blocks, _ = get_text_blocks(new_doc, max_pages=1)
page = blocks[0]
page.ocr_method = "tesseract"
return page
4 changes: 2 additions & 2 deletions marker/pdf/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,15 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
return out_page


- def get_text_blocks(doc, max_pages: Optional[int] = None) -> (List[Page], Dict):
+ def get_text_blocks(doc, fname, max_pages: Optional[int] = None) -> (List[Page], Dict):
toc = get_toc(doc)

page_range = range(len(doc))
if max_pages:
range_end = min(max_pages, len(doc))
page_range = range(range_end)

- char_blocks = dictionary_output(doc, page_range=page_range, keep_chars=True)
+ char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=True, workers=settings.PDFTEXT_CPU_WORKERS)
marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]

return marker_blocks, toc
Expand Down
3 changes: 3 additions & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def TORCH_DEVICE_MODEL(self) -> str:
"application/pdf": "pdf",
}

+ # Text extraction
+ PDFTEXT_CPU_WORKERS: int = 4 # How many CPU workers to use for pdf text extraction

# Text line Detection
DETECTOR_BATCH_SIZE: Optional[int] = None # Defaults to 6 for CPU, 12 otherwise
SURYA_DETECTOR_DPI: int = 96
Expand Down
Loading

0 comments on commit 0d9b0db

Please sign in to comment.