Output quality fixes

VikParuchuri · May 2, 2024 · c10c3a0 · c10c3a0
1 parent 07ab29f
commit c10c3a0
Show file tree

Hide file tree

Showing 9 changed files with 46 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -13,14 +13,12 @@ Marker converts PDF, EPUB, and MOBI to markdown.  It's 10x faster than nougat, m
 
 Marker is a pipeline of deep learning models:
 
-- Extract text, OCR if necessary (heuristics, tesseract)
-- Detect page layout ([layout segmenter](https://huggingface.co/vikp/layout_segmenter), [column detector](https://huggingface.co/vikp/column_detector))
-- Clean and format each block (heuristics, [texify](https://huggingface.co/vikp/texify))
+- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya), tesseract)
+- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya))
+- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify))
 - Combine blocks and postprocess complete text (heuristics, [pdf_postprocessor](https://huggingface.co/vikp/pdf_postprocessor_t5))
 
-Relying on autoregressive forward passes to generate text is slow and prone to hallucination/repetition.  From the nougat paper: `We observed [repetition] in 1.5% of pages in the test set, but the frequency increases for out-of-domain documents.`  In my anecdotal testing, repetitions happen on 5%+ of out-of-domain (non-arXiv) pages.  
-
-Nougat is an amazing model, but I wanted a faster and more general purpose solution. Marker is 10x faster and has low hallucination risk because it only passes equation blocks through an LLM forward pass.
+It only uses models where necessary, which improves speed and accuracy.
 
 ## Examples
 
@@ -51,7 +49,6 @@ PDF is a tricky format, so marker will not always work perfectly.  Here are some
 - Marker will not convert 100% of equations to LaTeX.  This is because it has to first detect equations, then convert them.
 - Whitespace and indentations are not always respected.
 - Not all lines/spans will be joined properly.
-- Languages similar to English (Spanish, French, German, Russian, etc) have the best support. There is provisional support for Chinese, Japanese, Korean, and Hindi, but it may not work as well.  You can add other languages by adding them to the `TESSERACT_LANGUAGES` and `SPELLCHECK_LANGUAGES` settings in `settings.py`.
 - This works best on digital PDFs that won't require a lot of OCR.  It's optimized for speed, and limited OCR is used to fix errors.
 
 # Installation
@@ -65,29 +62,31 @@ First, clone the repo:
 
 ## Linux
 
+- Install python requirements
+  - `poetry install`
+  - `poetry shell` to activate your poetry venv
+- Update pytorch since poetry doesn't play nicely with it
+  - GPU only: run `pip install torch` to install other torch dependencies.
+  - CPU only: Uninstall torch with `poetry remove torch`, then follow the [CPU install](https://pytorch.org/get-started/locally/) instructions.
+
 - Optional: Install system requirements, only needed if using `ocrmypdf` as the ocr backend
   - Optional: Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
   - Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
   - Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
   - Set the tesseract data folder path
     - Find the tesseract data folder `tessdata` with `find / -name tessdata`.  Make sure to use the one corresponding to the latest tesseract version if you have multiple.
     - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
+
+## Mac
+
 - Install python requirements
   - `poetry install`
   - `poetry shell` to activate your poetry venv
-- Update pytorch since poetry doesn't play nicely with it
-  - GPU only: run `pip install torch` to install other torch dependencies.
-  - CPU only: Uninstall torch with `poetry remove torch`, then follow the [CPU install](https://pytorch.org/get-started/locally/) instructions.
-
-## Mac
 
 - Optional: Install system requirements from `scripts/install/brew-requirements.txt`, only needed if using `ocrmypdf` for OCR
   - Set the tesseract data folder path
     - Find the tesseract data folder `tessdata` with `brew list tesseract`
     - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
-- Install python requirements
-  - `poetry install`
-  - `poetry shell` to activate your poetry venv
 
 # Usage
 

diff --git a/marker/cleaners/code.py b/marker/cleaners/code.py
@@ -1,4 +1,4 @@
-from marker.schema.schema import Span, Line
+from marker.schema.block import Span, Line
 from marker.schema.page import Page
 import re
 from typing import List

diff --git a/marker/cleaners/table.py b/marker/cleaners/table.py
@@ -1,5 +1,5 @@
 from marker.schema.bbox import merge_boxes, box_intersection_pct, rescale_bbox
-from marker.schema.schema import Line, Span, Block
+from marker.schema.block import Line, Span, Block
 from marker.schema.page import Page
 from tabulate import tabulate
 from typing import List, Dict

diff --git a/marker/equations/equations.py b/marker/equations/equations.py
@@ -5,7 +5,7 @@
 from marker.equations.images import get_equation_image
 from marker.equations.inference import get_total_texify_tokens, get_latex_batched
 from marker.schema.page import Page
-from marker.schema.schema import Line, Span, Block
+from marker.schema.block import Line, Span, Block
 from marker.settings import settings
 
 

diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
@@ -11,7 +11,7 @@
 from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
 from marker.pdf.images import render_image
 from marker.schema.page import Page
-from marker.schema.schema import Block, Line, Span
+from marker.schema.block import Block, Line, Span
 from marker.settings import settings
 from marker.pdf.extract_text import get_text_blocks
 

diff --git a/marker/ocr/utils.py b/marker/ocr/utils.py
@@ -1,28 +1,33 @@
 from typing import Optional
 
 
-def font_flags_decomposer(flags: Optional[int]):
+def font_flags_decomposer(flags: Optional[int]) -> str:
     if flags is None:
         return ""
 
-    flags = int(flags)
-
-    l = []
-    if flags & 2 ** 0:
-        l.append("superscript")
-    if flags & 2 ** 1:
-        l.append("italic")
-    if flags & 2 ** 2:
-        l.append("serifed")
-    else:
-        l.append("sans")
-    if flags & 2 ** 3:
-        l.append("monospaced")
-    else:
-        l.append("proportional")
-    if flags & 2 ** 4:
-        l.append("bold")
-    return "_".join(l)
+    flag_descriptions = []
+    if flags & (1 << 0):  # PDFFONT_FIXEDPITCH
+        flag_descriptions.append("fixed_pitch")
+    if flags & (1 << 1):  # PDFFONT_SERIF
+        flag_descriptions.append("serif")
+    if flags & (1 << 2):  # PDFFONT_SYMBOLIC
+        flag_descriptions.append("symbolic")
+    if flags & (1 << 3):  # PDFFONT_SCRIPT
+        flag_descriptions.append("script")
+    if flags & (1 << 5):  # PDFFONT_NONSYMBOLIC
+        flag_descriptions.append("non_symbolic")
+    if flags & (1 << 6):  # PDFFONT_ITALIC
+        flag_descriptions.append("italic")
+    if flags & (1 << 16): # PDFFONT_ALLCAP
+        flag_descriptions.append("all_cap")
+    if flags & (1 << 17): # PDFFONT_SMALLCAP
+        flag_descriptions.append("small_cap")
+    if flags & (1 << 18): # PDFFONT_FORCEBOLD
+        flag_descriptions.append("bold")
+    if flags & (1 << 19): # PDFFONT_USEEXTERNATTR
+        flag_descriptions.append("use_extern_attr")
+
+    return "_".join(flag_descriptions)
 
 
 def alphanum_ratio(text):

diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py
@@ -8,7 +8,7 @@
 from marker.ocr.utils import font_flags_decomposer
 from marker.ocr.heuristics import detect_bad_ocr
 from marker.settings import settings
-from marker.schema.schema import Span, Line, Block
+from marker.schema.block import Span, Line, Block
 from marker.schema.page import Page
 from pdftext.extraction import dictionary_output
 

diff --git a/marker/schema/schema.py → marker/schema/block.py b/marker/schema/schema.py → marker/schema/block.py
@@ -17,6 +17,8 @@ class Span(BboxElement):
     font: str
     font_weight: float
     font_size: float
+    bold: Optional[bool] = None
+    italic: Optional[bool] = None
 
 
     @field_validator('text')

diff --git a/marker/schema/page.py b/marker/schema/page.py
@@ -2,7 +2,7 @@
 from typing import List, Optional, Dict
 
 from marker.schema.bbox import BboxElement
-from marker.schema.schema import Block, Span
+from marker.schema.block import Block, Span
 from surya.schema import TextDetectionResult, LayoutResult, OrderResult