From c3d8b1d8c850ac6e4d5dd2f635f69f07fe59d6c6 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 1 Dec 2023 09:22:52 -0800 Subject: [PATCH] Default to tesseract for OCR (faster than ocrmypdf) --- README.md | 25 +-- convert.py | 4 +- data/examples/marker/thinkpython.md | 268 +++++++++++++++++++--------- marker/cleaners/table.py | 2 +- marker/convert.py | 6 +- marker/extract_text.py | 12 +- marker/ocr/page.py | 17 +- marker/ocr/utils.py | 26 +-- marker/schema.py | 6 +- marker/settings.py | 1 + 10 files changed, 244 insertions(+), 123 deletions(-) diff --git a/README.md b/README.md index 2238da76..fd34d456 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,16 @@ The above results are with marker and nougat setup so they each take ~3GB of VRA See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks. +# Limitations + +PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address: + +- Marker will convert fewer equations to latex than nougat. This is because it has to first detect equations, then convert them without hallucination. +- Whitespace and indentations are not always respected. +- Not all lines/spans will be joined properly. +- Only languages similar to English (Spanish, French, German, Russian, etc) are supported. Languages with different character sets (Chinese, Japanese, Korean, etc) are not. +- This works best on digital PDFs that won't require a lot of OCR. It's optimized for speed, and limited OCR is used to fix errors. + # Installation This has been tested on Mac and Linux (Ubuntu and Debian). You'll need python 3.9+ and [poetry](https://python-poetry.org/docs/#installing-with-the-official-installer). @@ -82,8 +92,9 @@ First, some configuration: - Set your torch device in the `local.env` file. For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default.
- If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU). For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`. - Depending on your document types, marker's average memory usage per task can vary slightly. You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors. -- By default, the final editor model is off. Turn it on with `ENABLE_EDITOR_MODEL`. -- Inspect the settings in `marker/settings.py`. You can override any settings in the `local.env` file, or by setting environment variables. +- Inspect the other settings in `marker/settings.py`. You can override any settings in the `local.env` file, or by setting environment variables. + - By default, the final editor model is off. Turn it on with `ENABLE_EDITOR_MODEL`. + - By default, marker will use ocrmypdf for OCR, which is slower than base tesseract, but higher quality. You can change this with the `OCR_ENGINE` setting. ## Convert a single file @@ -178,16 +189,6 @@ This will benchmark marker against other text extraction methods. It sets up ba Omit `--nougat` to exclude nougat from the benchmark. I don't recommend running nougat on CPU, since it is very slow. -# Limitations - -PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address: - -- Marker will convert fewer equations to latex than nougat. This is because it has to first detect equations, then convert them without hallucation. -- Whitespace and indentations are not always respected. -- Not all lines/spans will be joined properly. -- Only languages similar to English (Spanish, French, German, Russian, etc) are supported. Languages with different character sets (Chinese, Japanese, Korean, etc) are not. -- This works best on digital PDFs that won't require a lot of OCR. It's optimized for speed, and limited OCR is used to fix errors. 
- # Commercial usage Due to the licensing of the underlying models like layoutlmv3 and nougat, this is only suitable for noncommercial usage. diff --git a/convert.py b/convert.py index 59ea1d28..eb4499b6 100644 --- a/convert.py +++ b/convert.py @@ -1,6 +1,6 @@ import argparse import os -from typing import Dict +from typing import Dict, Optional import ray from tqdm import tqdm @@ -17,7 +17,7 @@ @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0) -def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Dict | None=None, min_length: int | None = None): +def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None): out_filename = fname.rsplit(".", 1)[0] + ".md" out_filename = os.path.join(out_folder, os.path.basename(out_filename)) out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json" diff --git a/data/examples/marker/thinkpython.md b/data/examples/marker/thinkpython.md index 62995409..9ff980ce 100644 --- a/data/examples/marker/thinkpython.md +++ b/data/examples/marker/thinkpython.md @@ -139,6 +139,65 @@ errors. - Ishwar Bhat corrected my statement of Fermat's last theorem. - Brian McGhie suggested a clarification. - Andrea Zanella translated the book into Italian, and sent a number of corrections along the way. +## Contents + +| | Preface | v | +|------|---------------------------------------|-----| +| 1 | The way of the program | 1 | +| 1.1 | The Python programming language | | +| 1.2 | What is a program? | 3 | +| 1.3 | What is debugging? 
| 3 | +| 1.4 | Formal and natural languages | 5 | +| 1.5 | The first program | 6 | +| 1.6 | Debugging | 7 | +| 1.7 | Glossary | | +| 1.8 | Exercises | 9 | +| 2 | Variables, expressions and statements | 11 | +| 2.1 | Values and types | | +| 2.2 | Variables | 12 | +| 2.3 | Variable names and keywords | 12 | +| 2.4 | Operators and operands | 13 | +| 2.5 | Expressions and statements | 14 | +| 2.6 | Interactive mode and script mode | 14 | +| 2.7 | Order of operations | 15 | +| 2.8 | String operations | | +| 2.9 | Comments | 16 | +| 2.1 | Debugging | 16 | +| 2.11 | Glossary | | +| 2.12 | Exercises | 18 | +| xviii | Contents | +|-----------------------------------------|---------------------------------| +| 12 Tuples | 113 | +| 12.1 | Tuples are immutable | +| 12.2 | Tuple assignment | +| 12.3 | Tuples as return values | +| 12.4 | Variable-length argument tuples | +| 12.5 | Lists and tuples | +| 12.6 | Dictionaries and tuples | +| 12.7 | Comparing tuples | +| 12.8 | Sequences of sequences | +| 12.9 | Debugging | +| 12.10 Glossary | | +| 12.11 Exercises | 121 | +| 13 Case study: data structure selection | 123 | +| 13.1 | Word frequency analysis | +| 13.2 | Random numbers | +| 13.3 | Word histogram | +| 13.4 | Most common words | +| 13.5 | Optional parameters | +| 13.6 | Dictionary subtraction | +| 13.7 | Random words | +| 13.8 | Markov analysis | +| 13.9 | Data structures | +| 13.10 Debugging | 131 | +| 13.11 Glossary | | +| 13.12 Exercises | 132 | +| 14 Files | 133 | +| 14.1 | Persistence | +| 14.2 | Reading and writing | +| 14.3 | Format operator | +| 14.4 | Filenames and paths | + ## Chapter 1 The Way Of The Program The goal of this book is to teach you to think like a computer scientist. This way of thinking combines some of the best features of mathematics, engineering, and natural science. Like mathematicians, computer scientists use formal languages to denote ideas (specifically computations). 
Like engineers, they design things, assembling components into systems and evaluating tradeoffs among alternatives. Like scientists, they observe the behavior of complex systems, form hypotheses, and test predictions. @@ -149,19 +208,14 @@ The single most important skill for a computer scientist is **problem solving**. The programming language you will learn is Python. Python is an example of a high-level language; other high-level languages you might have heard of are C, C++, Perl, and Java. There are also **low-level languages**, sometimes referred to as "machine languages" or "assembly languages." Loosely speaking, computers can only run programs written in lowlevel languages. So programs written in a high-level language have to be processed before they can run. This extra processing takes some time, which is a small disadvantage of high-level languages. The advantages are enormous. First, it is much easier to program in a high-level language. Programs written in a high-level language take less time to write, they are shorter and easier to read, and they are more likely to be correct. Second, high-level languages are portable, meaning that they can run on different kinds of computers with few or no modifications. Low-level programs can run on only one kind of computer and have to be rewritten to run on another. -| SOURCE | -|-------------| -| CODE | -| INTERPRETER | -| OUTPUT | -| CODE | -|----------| -| OBJECT | -| EXECUTOR | -| CODE | -| SOURCE | -| COMPILER | -| OUTPUT | +SOURCE CODE +INTERPRETER +OUTPUT +CODE OBJECT +EXECUTOR +CODE SOURCE +COMPILER +OUTPUT Due to these advantages, almost all programs are written in high-level languages. Lowlevel languages are used only for a few specialized applications. @@ -934,6 +988,8 @@ values they refer to. frame: A box in a stack diagram that represents a function call. It contains the local variables and parameters of the function. 
traceback: A list of the functions that are executing, printed when an exception occurs. +## 3.16 Exercises + Exercise 3.3. *Python provides a built-in function called* len that returns the length of a string, so the value of len('allen') is 5. Write a function named right_justify that takes a string named s as a parameter and prints the string with enough leading spaces so that the last letter of the string is in column 70 of the display. @@ -1472,12 +1528,19 @@ Exercise 5.1. *Draw a stack diagram for* print_n *called with* s = 'Hello' *and* Exercise 5.2. *Write a function called* do_n that takes a function object and a number, n, as arguments, and that calls the given function n times. -| | | | -|-----------|----|------------| -| countdown | n | 3 | -| countdown | n | 2 | -| countdown | n | 1 | -| countdown | n | 0 | + +countdown +n +3 +countdown +n +2 +countdown +n +1 +countdown +n +0 ## 5.10 Infinite Recursion @@ -1597,6 +1660,9 @@ branch: One of the alternative sequences of statements in a conditional statemen nested conditional: A conditional statement that appears in one of the branches of another conditional statement. recursion: The process of calling the function that is currently executing. base case: A conditional branch in a recursive function that does not make a recursive call. infinite recursion: A recursion that doesn't have a base case, or never reaches it. Eventually, an infinite recursion causes a runtime error. + +## 5.14 Exercises + Exercise 5.3. Fermat's Last Theorem says that there are no positive integers a, b, and c such that an + bn = cn @@ -1861,16 +1927,34 @@ The return value (1) is multiplied by n, which is 2, and the result is returned. The return value (2) is multiplied by n, which is 3, and the result, 6, becomes the return value of the function call that started the whole process. Figure 6.1 shows what the stack diagram looks like for this sequence of function calls. The return values are shown being passed back up the stack. 
In each frame, the return value is the value of result, which is the product of n and recurse. In the last frame, the local variables recurse and result do not exist, because the branch that creates them does not execute. -| | | | | | | 6 | -|-----------|----|----|---------|----|--------|--------| -| factorial | n | 3 | recurse | 2 | 6 | result | -| 2 | | | | | | | -| factorial | n | 2 | recurse | 1 | result | 2 | -| 1 | | | | | | | -| factorial | n | 1 | recurse | 1 | 1 | result | -| factorial | | | | | | | -| 1 | | | | | | | -| n | 0 | | | | | | +6 +factorial +n +3 +recurse +2 +6 +result +2 +factorial +n +2 +recurse +1 +result +2 +1 +factorial +n +1 +recurse +1 +1 +result +factorial +1 +n +0 ## 6.6 Leap Of Faith @@ -1992,6 +2076,9 @@ adding and testing only a small amount of code at a time. scaffolding: Code that is used during program development but is not part of the final version. guardian: A programming pattern that uses a conditional statement to check for and handle circumstances that might cause an error. + +## 6.11 Exercises + Exercise 6.4. Draw a stack diagram for the following program. What does the program print? Solution: http: // thinkpython. com/ code/ stack_ diagram. py . def b(z): @@ -2916,17 +3003,21 @@ As you might expect, you can assign list values to variables: The syntax for accessing the elements of a list is the same as for accessing the characters of a string—the bracket operator. The expression inside the brackets specifies the index. Remember that the indices start at 0: >>> print cheeses[0] Cheddar -| | | list | -|---------|---------|-----------| -| cheeses | 0 | 'Cheddar' | -| 1 | 'Edam' | | -| 2 | 'Gouda' | | -| list | | | -| numbers | 17 | 0 | -| 1 | 123 | | -| 5 | | | -| list | | | -| empty | | | +list +cheeses +0 +'Cheddar' +1 +'Edam' +2 +'Gouda' +list +numbers +17 +0 1 +123 5 +list +empty Unlike strings, lists are mutable. 
When the bracket operator appears on the left side of an assignment, it identifies the element of the list that will be assigned. @@ -3340,6 +3431,8 @@ identical: Being the same object (which implies equivalence). reference: The association between a variable and its value. aliasing: A circumstance where two or more variables refer to the same object. delimiter: A character or string used to indicate where a string should be split. +## 10.15 Exercises + Exercise 10.6. *Write a function called* is_sorted *that takes a list as a parameter and returns* True if the list is sorted in ascending order and False otherwise. You can assume (as a precondition) that the elements of the list can be compared with the relational operators <, >, etc. For example, is_sorted([1,2,2]) *should return* True *and* is_sorted(['b','a']) should return False. Exercise 10.7. Two words are anagrams if you can rearrange the letters from one to spell the other. Write a function called is_anagram *that takes two strings and returns* True if they are anagrams. @@ -3704,6 +3797,8 @@ global variable: A variable defined outside a function. Global variables can be flag: A boolean variable used to indicate whether a condition is true. declaration: A statement like global that tells the interpreter something about a variable. +## 11.10 Exercises + Exercise 11.9. *If you did Exercise 10.8, you already have a function named* has_duplicates that takes a list as a parameter and returns True if there is any object that appears more than once in the list. Use a dictionary to write a faster, simpler version of has_duplicates. @@ -4012,6 +4107,8 @@ data structure: A collection of related values, often organized in lists, dictio shape (of a data structure): A summary of the type, size and composition of a data structure. +## 12.11 Exercises + Exercise 12.3. *Write a function called* most_frequent that takes a string and prints the letters in decreasing order of frequency. 
Find text samples from several different languages and see how letter frequency varies between languages. Compare your results with the tables at http: // en. wikipedia. org/ wiki/ Letter_ frequencies *. Solution:* http: // thinkpython. com/ code/ most_ frequent. py . Exercise 12.4. More anagrams! @@ -4669,11 +4766,12 @@ class Point(object): """Represents a point in 2-D space.""" This header indicates that the new class is a Point, which is a kind of object, which is a built-in type. The body is a docstring that explains what the class is for. You can define variables and functions inside a class definition, but we will get back to that later. Defining a class named Point creates a class object. -| blank | -|---------| -| Point | -| x | -| y | +blank +Point +x +3.0 +y +4.0 >>> print Point @@ -4739,14 +4837,18 @@ class Rectangle(object): The docstring lists the attributes: width and height are numbers; corner is a Point object that specifies the lower-left corner. To represent a rectangle, you have to instantiate a Rectangle object and assign values to the attributes: box = Rectangle() box.width = 100.0 box.height = 200.0 -| | | Rectangle | -|--------|-------|-------------| -| box | width | 100 | -| height | 200.0 | | -| Point | | | -| 0.0 | x | | -| corner | | | -| y | 0.0 | | +Rectangle +box +width +100.0 +height +200.0 +Point +0.0 +x +corner +y +0.0 box.corner = Point() box.corner.x = 0.0 box.corner.y = 0.0 The expression box.corner.x means, "Go to the object box refers to and select the attribute named corner; then go to that object and select the attribute named x." @@ -4911,11 +5013,14 @@ statement. In the next few sections, we'll write two functions that add time values. They demonstrate two kinds of functions: pure functions and modifiers. They also demonstrate a development plan I'll call **prototype and patch**, which is a way of tackling a complex problem by starting with a simple prototype and incrementally dealing with the complications. 
-| | | Time | -|--------|------|--------| -| time | hour | 11 | -| minute | 59 | | -| second | 30 | | +Time +time +hour +11 +minute +59 +second +30 Here is a simple prototype of add_time: @@ -5403,34 +5508,32 @@ One possibility is to use strings containing words like 'Spade' for suits and 'Q An alternative is to use integers to **encode** the ranks and suits. In this context, "encode" means that we are going to define a mapping between numbers and suits, or between numbers and ranks. This kind of encoding is not meant to be a secret (that would be "encryption"). For example, this table shows the suits and the corresponding integer codes: -| Spades | -|----------| -| �→ | -| 3 | -| Hearts | -| �→ | -| 2 | -| Diamonds | -| �→ | -| 1 | -| Clubs | -| �→ | -| 0 | +Spades +�→ +3 +Hearts +�→ +2 +Diamonds +�→ +1 +Clubs +�→ +0 This code makes it easy to compare cards; because higher suits map to higher numbers, we can compare suits by comparing their codes. The mapping for ranks is fairly obvious; each of the numerical ranks maps to the corresponding integer, and for face cards: -| Jack | -|--------| -| �→ | -| 11 | -| Queen | -| �→ | -| 12 | -| King | -| �→ | -| 13 | +Jack +�→ +11 +Queen +�→ +12 +King +�→ +13 I am using the �→ symbol to make it clear that these mappings are not part of the Python program. They are part of the program design, but they don't appear explicitly in the code. The class definition for Card looks like this: class Card(object): @@ -6070,6 +6173,9 @@ corners. pack: To arrange and display the elements of a GUI. geometry manager: A system for packing widgets. binding: An association between a widget, an event, and an event handler. The event handler is called when the event occurs in the widget. + +## 19.11 Exercises + Exercise 19.4. For this exercise, you will write an image viewer. 
Here is a simple example: g = Gui() diff --git a/marker/cleaners/table.py b/marker/cleaners/table.py index 7303c3b7..883b4ee6 100644 --- a/marker/cleaners/table.py +++ b/marker/cleaners/table.py @@ -77,7 +77,7 @@ def create_new_tables(blocks: List[Page]): table_rows.append(row) # Don't render tables if they will be too large - if max([len("".join(r)) for r in table_rows]) > 300 or len(table_rows[0]) > 8: + if max([len("".join(r)) for r in table_rows]) > 300 or len(table_rows[0]) > 8 or len(table_rows[0]) < 2: continue new_text = tabulate(table_rows, headers="firstrow", tablefmt="github") diff --git a/marker/convert.py b/marker/convert.py index 2a16b37e..ddf72dce 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -11,7 +11,7 @@ from marker.cleaners.bullets import replace_bullets from marker.markdown import merge_spans, merge_lines, get_full_text from marker.schema import Page, BlockType -from typing import List, Dict, Tuple +from typing import List, Dict, Tuple, Optional from copy import deepcopy import re import magic @@ -59,7 +59,7 @@ def convert_single_pdf( fname: str, model_lst: List, max_pages=None, - metadata: Dict | None=None, + metadata: Optional[Dict]=None, parallel_factor: int = 1 ) -> Tuple[str, Dict]: lang = settings.DEFAULT_LANG @@ -68,7 +68,7 @@ def convert_single_pdf( # Use tesseract language if available tess_lang = settings.TESSERACT_LANGUAGES.get(lang, "eng") - spell_lang = settings.SPELLCHECK_LANGUAGES.get(lang, "en") + spell_lang = settings.SPELLCHECK_LANGUAGES.get(lang, None) if "eng" not in tess_lang: tess_lang = f"eng+{tess_lang}" diff --git a/marker/extract_text.py b/marker/extract_text.py index ebf7f4fc..81de2969 100644 --- a/marker/extract_text.py +++ b/marker/extract_text.py @@ -1,9 +1,9 @@ import os -from typing import Tuple, List +from typing import Tuple, List, Optional from spellchecker import SpellChecker -from marker.ocr.page import ocr_entire_page_ocrmp, ocr_entire_page_tess +from marker.ocr.page import ocr_entire_page from 
marker.ocr.utils import detect_bad_ocr, font_flags_decomposer from marker.settings import settings from marker.schema import Span, Line, Block, Page @@ -12,10 +12,10 @@ os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX -def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: SpellChecker | None = None, ocr=False) -> Tuple[List[Block], int]: +def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]: page = doc[pnum] if ocr: - blocks = ocr_entire_page_ocrmp(page, tess_lang, spellchecker) + blocks = ocr_entire_page(page, tess_lang, spellchecker) else: blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS)["blocks"] @@ -57,7 +57,7 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: SpellCh return page_blocks -def convert_single_page(doc, pnum, tess_lang: str, spell_lang: str, no_text: bool, disable_ocr: bool = False, min_ocr_page: int = 2): +def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no_text: bool, disable_ocr: bool = False, min_ocr_page: int = 2): ocr_pages = 0 ocr_success = 0 ocr_failed = 0 @@ -90,7 +90,7 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: str, no_text: boo return page_obj, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success} -def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None = None, parallel: int = settings.OCR_PARALLEL_WORKERS): +def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: Optional[int] = None, parallel: int = settings.OCR_PARALLEL_WORKERS): all_blocks = [] toc = doc.get_toc() ocr_pages = 0 diff --git a/marker/ocr/page.py b/marker/ocr/page.py index b536caa7..8cc7c546 100644 --- a/marker/ocr/page.py +++ b/marker/ocr/page.py @@ -1,5 +1,5 @@ import io -from typing import List +from typing import List, Optional import fitz as pymupdf import ocrmypdf @@ -12,7 +12,17 @@ 
ocrmypdf.configure_logging(verbosity=ocrmypdf.Verbosity.quiet) -def ocr_entire_page_tess(page, lang: str, spellchecker: SpellChecker | None = None) -> List[Block]: +def ocr_entire_page(page, lang: str, spellchecker: Optional[SpellChecker] = None) -> List[Block]: + match settings.OCR_ENGINE: + case "tesseract": + return ocr_entire_page_tess(page, lang, spellchecker) + case "ocrmypdf": + return ocr_entire_page_ocrmp(page, lang, spellchecker) + case _: + raise ValueError(f"Unknown OCR engine {settings.OCR_ENGINE}") + + +def ocr_entire_page_tess(page, lang: str, spellchecker: Optional[SpellChecker] = None) -> List[Block]: try: full_tp = page.get_textpage_ocr(flags=settings.TEXT_FLAGS, dpi=settings.OCR_DPI, full=True, language=lang) blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)["blocks"] @@ -30,7 +40,7 @@ def ocr_entire_page_tess(page, lang: str, spellchecker: SpellChecker | None = No return blocks -def ocr_entire_page_ocrmp(page, lang: str, spellchecker: SpellChecker | None = None) -> List[Block]: +def ocr_entire_page_ocrmp(page, lang: str, spellchecker: Optional[SpellChecker] = None) -> List[Block]: # Use ocrmypdf to get OCR text for the whole page src = page.parent # the page's document blank_doc = pymupdf.open() # make temporary 1-pager @@ -46,6 +56,7 @@ def ocr_entire_page_ocrmp(page, lang: str, spellchecker: SpellChecker | None = N redo_ocr=True, progress_bar=False, optimize=False, + fast_web_view=1e6, skip_big=15, # skip images larger than 15 megapixels tesseract_timeout=settings.TESSERACT_TIMEOUT, tesseract_non_ocr_timeout=settings.TESSERACT_TIMEOUT, diff --git a/marker/ocr/utils.py b/marker/ocr/utils.py index f1d18f3e..3da9e78c 100644 --- a/marker/ocr/utils.py +++ b/marker/ocr/utils.py @@ -1,9 +1,12 @@ +from typing import Optional + from nltk import wordpunct_tokenize from spellchecker import SpellChecker from marker.settings import settings +import re -def detect_bad_ocr(text, spellchecker: SpellChecker | None, 
misspell_threshold=.6, space_threshold=.5, newline_threshold=.4, alphanum_threshold=.4): +def detect_bad_ocr(text, spellchecker: Optional[SpellChecker], misspell_threshold=.7, space_threshold=.6, newline_threshold=.5, alphanum_threshold=.4): if len(text) == 0: # Assume OCR failed if we have no text return True @@ -17,21 +20,21 @@ def detect_bad_ocr(text, spellchecker: SpellChecker | None, misspell_threshold=. if len(misspelled) > len(alpha_words) * misspell_threshold: return True - spaces = text.count(" ") - # More than 50% of chars are spaces - if spaces / len(text) > space_threshold: + spaces = len(re.findall(r'\s+', text)) + alpha_chars = len(re.sub(r'\s+', '', text)) + if spaces / (alpha_chars + spaces) > space_threshold: return True - newlines = text.count("\n") - # More than 40% of chars are newlines - if newlines / len(text) > newline_threshold: + newlines = len(re.findall(r'\n+', text)) + non_newlines = len(re.sub(r'\n+', '', text)) + if newlines / (newlines + non_newlines) > newline_threshold: return True if alphanum_ratio(text) < alphanum_threshold: # Garbled text return True invalid_chars = len([c for c in text if c in settings.INVALID_CHARS]) - if invalid_chars > max(3.0, len(text) * .03): + if invalid_chars > max(3.0, len(text) * .02): return True return False @@ -58,13 +61,12 @@ def font_flags_decomposer(flags): def alphanum_ratio(text): + text = text.replace(" ", "") + text = text.replace("\n", "") alphanumeric_count = sum([1 for c in text if c.isalnum()]) if len(text) == 0: - if alphanumeric_count == 0: - return 1 - else: - return 0 + return 1 ratio = alphanumeric_count / len(text) return ratio diff --git a/marker/schema.py b/marker/schema.py index cfa4e2a1..87da416f 100644 --- a/marker/schema.py +++ b/marker/schema.py @@ -58,9 +58,9 @@ class Span(BboxElement): span_id: str font: str color: int - ascender: float | None = None - descender: float | None = None - block_type: str | None = None + ascender: Optional[float] = None + descender: 
Optional[float] = None + block_type: Optional[str] = None selected: bool = True diff --git a/marker/settings.py b/marker/settings.py index cb548a67..cc9e438a 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -50,6 +50,7 @@ class Settings(BaseSettings): } OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted OCR_PARALLEL_WORKERS: int = 2 # How many CPU workers to use for OCR + OCR_ENGINE: str = "ocrmypdf" # Which OCR engine to use, either "tesseract" or "ocrmypdf". Ocrmypdf is higher quality, but slower. # Nougat model NOUGAT_MODEL_MAX: int = 512 # Max inference length for nougat