From c10c3a01f0aaa0ea6ecbba37a49a69722642c5d3 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 2 May 2024 14:25:39 -0700 Subject: [PATCH] Output quality fixes --- README.md | 29 +++++++++--------- marker/cleaners/code.py | 2 +- marker/cleaners/table.py | 2 +- marker/equations/equations.py | 2 +- marker/ocr/recognition.py | 2 +- marker/ocr/utils.py | 43 +++++++++++++++------------ marker/pdf/extract_text.py | 2 +- marker/schema/{schema.py => block.py} | 2 ++ marker/schema/page.py | 2 +- 9 files changed, 46 insertions(+), 40 deletions(-) rename marker/schema/{schema.py => block.py} (96%) diff --git a/README.md b/README.md index a37e1e87..4e1132a1 100644 --- a/README.md +++ b/README.md @@ -13,14 +13,12 @@ Marker converts PDF, EPUB, and MOBI to markdown. It's 10x faster than nougat, m Marker is a pipeline of deep learning models: -- Extract text, OCR if necessary (heuristics, tesseract) -- Detect page layout ([layout segmenter](https://huggingface.co/vikp/layout_segmenter), [column detector](https://huggingface.co/vikp/column_detector)) -- Clean and format each block (heuristics, [texify](https://huggingface.co/vikp/texify)) +- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya), tesseract) +- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya)) +- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify)) - Combine blocks and postprocess complete text (heuristics, [pdf_postprocessor](https://huggingface.co/vikp/pdf_postprocessor_t5)) -Relying on autoregressive forward passes to generate text is slow and prone to hallucination/repetition. From the nougat paper: `We observed [repetition] in 1.5% of pages in the test set, but the frequency increases for out-of-domain documents.` In my anecdotal testing, repetitions happen on 5%+ of out-of-domain (non-arXiv) pages. - -Nougat is an amazing model, but I wanted a faster and more general purpose solution. 
Marker is 10x faster and has low hallucination risk because it only passes equation blocks through an LLM forward pass. +It only uses models where necessary, which improves speed and accuracy. ## Examples @@ -51,7 +49,6 @@ PDF is a tricky format, so marker will not always work perfectly. Here are some - Marker will not convert 100% of equations to LaTeX. This is because it has to first detect equations, then convert them. - Whitespace and indentations are not always respected. - Not all lines/spans will be joined properly. -- Languages similar to English (Spanish, French, German, Russian, etc) have the best support. There is provisional support for Chinese, Japanese, Korean, and Hindi, but it may not work as well. You can add other languages by adding them to the `TESSERACT_LANGUAGES` and `SPELLCHECK_LANGUAGES` settings in `settings.py`. - This works best on digital PDFs that won't require a lot of OCR. It's optimized for speed, and limited OCR is used to fix errors. # Installation @@ -65,6 +62,13 @@ First, clone the repo: ## Linux +- Install python requirements + - `poetry install` + - `poetry shell` to activate your poetry venv +- Update pytorch since poetry doesn't play nicely with it + - GPU only: run `pip install torch` to install other torch dependencies. + - CPU only: Uninstall torch with `poetry remove torch`, then follow the [CPU install](https://pytorch.org/get-started/locally/) instructions. + - Optional: Install system requirements, only needed if using `ocrmypdf` as the ocr backend - Optional: Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`. - Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`. 
@@ -72,22 +76,17 @@ First, clone the repo: - Set the tesseract data folder path - Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple. - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it + +## Mac + - Install python requirements - `poetry install` - `poetry shell` to activate your poetry venv -- Update pytorch since poetry doesn't play nicely with it - - GPU only: run `pip install torch` to install other torch dependencies. - - CPU only: Uninstall torch with `poetry remove torch`, then follow the [CPU install](https://pytorch.org/get-started/locally/) instructions. - -## Mac - Optional: Install system requirements from `scripts/install/brew-requirements.txt`, only needed if using `ocrmypdf` for OCR - Set the tesseract data folder path - Find the tesseract data folder `tessdata` with `brew list tesseract` - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it -- Install python requirements - - `poetry install` - - `poetry shell` to activate your poetry venv # Usage diff --git a/marker/cleaners/code.py b/marker/cleaners/code.py index 6d5acb18..7d426124 100644 --- a/marker/cleaners/code.py +++ b/marker/cleaners/code.py @@ -1,4 +1,4 @@ -from marker.schema.schema import Span, Line +from marker.schema.block import Span, Line from marker.schema.page import Page import re from typing import List diff --git a/marker/cleaners/table.py b/marker/cleaners/table.py index a95cc387..bfb0e200 100644 --- a/marker/cleaners/table.py +++ b/marker/cleaners/table.py @@ -1,5 +1,5 @@ from marker.schema.bbox import merge_boxes, box_intersection_pct, rescale_bbox -from marker.schema.schema import Line, Span, Block +from marker.schema.block import Line, Span, Block from marker.schema.page import Page from tabulate import tabulate from typing import List, Dict diff --git 
a/marker/equations/equations.py b/marker/equations/equations.py index fddf4de2..a66b3d3e 100644 --- a/marker/equations/equations.py +++ b/marker/equations/equations.py @@ -5,7 +5,7 @@ from marker.equations.images import get_equation_image from marker.equations.inference import get_total_texify_tokens, get_latex_batched from marker.schema.page import Page -from marker.schema.schema import Line, Span, Block +from marker.schema.block import Line, Span, Block from marker.settings import settings diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py index 97f1d80a..d62624b8 100644 --- a/marker/ocr/recognition.py +++ b/marker/ocr/recognition.py @@ -11,7 +11,7 @@ from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr from marker.pdf.images import render_image from marker.schema.page import Page -from marker.schema.schema import Block, Line, Span +from marker.schema.block import Block, Line, Span from marker.settings import settings from marker.pdf.extract_text import get_text_blocks diff --git a/marker/ocr/utils.py b/marker/ocr/utils.py index 31dff6c2..2e0c80e8 100644 --- a/marker/ocr/utils.py +++ b/marker/ocr/utils.py @@ -1,28 +1,33 @@ from typing import Optional -def font_flags_decomposer(flags: Optional[int]): +def font_flags_decomposer(flags: Optional[int]) -> str: if flags is None: return "" - flags = int(flags) - - l = [] - if flags & 2 ** 0: - l.append("superscript") - if flags & 2 ** 1: - l.append("italic") - if flags & 2 ** 2: - l.append("serifed") - else: - l.append("sans") - if flags & 2 ** 3: - l.append("monospaced") - else: - l.append("proportional") - if flags & 2 ** 4: - l.append("bold") - return "_".join(l) + flag_descriptions = [] + if flags & (1 << 0): # PDFFONT_FIXEDPITCH + flag_descriptions.append("fixed_pitch") + if flags & (1 << 1): # PDFFONT_SERIF + flag_descriptions.append("serif") + if flags & (1 << 2): # PDFFONT_SYMBOLIC + flag_descriptions.append("symbolic") + if flags & (1 << 3): # PDFFONT_SCRIPT + 
flag_descriptions.append("script") + if flags & (1 << 5): # PDFFONT_NONSYMBOLIC + flag_descriptions.append("non_symbolic") + if flags & (1 << 6): # PDFFONT_ITALIC + flag_descriptions.append("italic") + if flags & (1 << 16): # PDFFONT_ALLCAP + flag_descriptions.append("all_cap") + if flags & (1 << 17): # PDFFONT_SMALLCAP + flag_descriptions.append("small_cap") + if flags & (1 << 18): # PDFFONT_FORCEBOLD + flag_descriptions.append("bold") + if flags & (1 << 19): # PDFFONT_USEEXTERNATTR + flag_descriptions.append("use_extern_attr") + + return "_".join(flag_descriptions) def alphanum_ratio(text): diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py index 53935ce2..d4312c61 100644 --- a/marker/pdf/extract_text.py +++ b/marker/pdf/extract_text.py @@ -8,7 +8,7 @@ from marker.ocr.utils import font_flags_decomposer from marker.ocr.heuristics import detect_bad_ocr from marker.settings import settings -from marker.schema.schema import Span, Line, Block +from marker.schema.block import Span, Line, Block from marker.schema.page import Page from pdftext.extraction import dictionary_output diff --git a/marker/schema/schema.py b/marker/schema/block.py similarity index 96% rename from marker/schema/schema.py rename to marker/schema/block.py index 9b1cdf54..ac5cf8b2 100644 --- a/marker/schema/schema.py +++ b/marker/schema/block.py @@ -17,6 +17,8 @@ class Span(BboxElement): font: str font_weight: float font_size: float + bold: Optional[bool] = None + italic: Optional[bool] = None @field_validator('text') diff --git a/marker/schema/page.py b/marker/schema/page.py index cedee79b..407939eb 100644 --- a/marker/schema/page.py +++ b/marker/schema/page.py @@ -2,7 +2,7 @@ from typing import List, Optional, Dict from marker.schema.bbox import BboxElement -from marker.schema.schema import Block, Span +from marker.schema.block import Block, Span from surya.schema import TextDetectionResult, LayoutResult, OrderResult