From e48364c7be191b1c867a3df1e1a37b3c34cb1674 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Fri, 20 Dec 2024 15:54:09 +0000 Subject: [PATCH] parse out and recreate spans in the high quality text processor and handle inline math rendering --- marker/processors/high_quality_text.py | 70 +++++++++++++++++++------- marker/providers/pdf.py | 2 +- marker/renderers/markdown.py | 9 +++- marker/schema/blocks/equation.py | 2 +- marker/schema/text/line.py | 9 ++-- marker/schema/text/span.py | 6 +++ 6 files changed, 73 insertions(+), 25 deletions(-) diff --git a/marker/processors/high_quality_text.py b/marker/processors/high_quality_text.py index afb23119..48afef62 100644 --- a/marker/processors/high_quality_text.py +++ b/marker/processors/high_quality_text.py @@ -1,6 +1,3 @@ -from marker.processors import BaseProcessor -from marker.schema import BlockTypes -from marker.schema.document import Document import json import time from concurrent.futures import ThreadPoolExecutor, as_completed @@ -8,10 +5,12 @@ import google.generativeai as genai import PIL +from bs4 import BeautifulSoup from google.ai.generativelanguage_v1beta.types import content from google.api_core.exceptions import ResourceExhausted from tqdm import tqdm +from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.blocks import Block from marker.schema.document import Document @@ -67,7 +66,7 @@ class HighQualityTextProcessor(BaseProcessor): * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters. * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies. 5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error. -6. Ensure that inline math is properly enclosed in dollar signs. +6. Ensure that inline math is properly with inline math tags. 7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. 8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. @@ -96,7 +95,7 @@ class HighQualityTextProcessor(BaseProcessor): "the model's risk under the worst-case perturbations, is cur-\n", "rently the most effective approach for improving the robust-\n", "ness of deep neural networks. For a given neural network\n", - "$f(x, w)$ with parameters $w$, the optimization objective of\n", + "f(x, w) with parameters w, the optimization objective of\n", "AT can be formulated as follows:\n" ] } @@ -167,21 +166,54 @@ def process_block_rewriting(self, document: Document, page: PageGroup, block: Bl if corrected_lines and len(corrected_lines) == len(extracted_lines): for text_line, corrected_text in zip(text_lines, corrected_lines): - span_block = page.add_full_block( - SpanClass( - polygon=text_line.polygon, - text=corrected_text + "\n", - font='Unknown', - font_weight=0, - font_size=0, - minimum_position=0, - maximum_position=0, - formats=['plain', 'math'], - page_id=text_line.page_id, - text_extraction_method="gemini", + text_line.structure = [] + corrected_spans = self.text_to_spans(corrected_text) + + for span_idx, span in enumerate(corrected_spans): + if span_idx == len(corrected_spans) - 1: + span['content'] += "\n" + + span_block = page.add_full_block( + SpanClass( + polygon=text_line.polygon, + text=span['content'], + font='Unknown', + font_weight=0, + font_size=0, + minimum_position=0, + maximum_position=0, + formats=[span['type']], + page_id=text_line.page_id, + text_extraction_method="gemini", + ) ) - ) - text_line.structure = [span_block.id] + text_line.structure.append(span_block.id) + + def text_to_spans(self, text): + soup = BeautifulSoup(text, 'html.parser') + + tag_types = { + 'b': 'bold', + 'i': 'italic', + 'math': 'math' + } + spans = [] + + for element in soup.descendants: + if not len(list(element.parents)) == 1: + continue + if element.name in tag_types: + spans.append({ + 'type': tag_types[element.name], + 'content': element.get_text() + }) + elif element.string: + spans.append({ + 'type': 'plain', + 'content': element.string + }) + + return spans def extract_image(self, page: PageGroup, image_block: Block, expand: float = 0.01): page_img = page.lowres_image diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 64a8d9a6..dd88c342 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -79,7 +79,7 @@ def font_flags_to_format(self, flags: int | None) -> Set[str]: formats = set() if set_flags == {"Symbolic", "Italic"} or \ set_flags == {"Symbolic", "Italic", "UseExternAttr"}: - formats.add("math") + formats.add("plain") elif set_flags == {"UseExternAttr"}: formats.add("plain") elif set_flags == {"Plain"}: diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index 0cadaf16..b7cc8ede 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -30,7 +30,7 @@ def convert_div(self, el, text, convert_as_inline): else: return text - def convert_p(self, el, text, *args): + def convert_p(self, el, text, convert_as_inline): hyphens = r'-—¬' has_continuation = el.has_attr('class') and 'has-continuation' in el['class'] if has_continuation: @@ -43,6 +43,13 @@ def convert_p(self, el, text, *args): return f"{text}" return f"{text}\n\n" if text else "" # default convert_p behavior + def convert_math(self, el, text, convert_as_inline): + block = el.has_attr('display') and el['display'] == 'block' + if block: + return text # TODO: Fix block math handling + + return f"${text}$" + class MarkdownOutput(BaseModel): markdown: str diff --git a/marker/schema/blocks/equation.py b/marker/schema/blocks/equation.py index 08caf707..a3d7478d 100644 --- a/marker/schema/blocks/equation.py +++ b/marker/schema/blocks/equation.py @@ -10,7 +10,7 @@ class Equation(Block): def assemble_html(self, child_blocks, parent_structure=None): if self.latex: - return f"\n

{html.escape(self.latex)}

\n" + return f"\n

{html.escape(self.latex)}

\n" else: template = super().assemble_html(child_blocks, parent_structure) return f"

{template}

" diff --git a/marker/schema/text/line.py b/marker/schema/text/line.py index 460307c7..70469757 100644 --- a/marker/schema/text/line.py +++ b/marker/schema/text/line.py @@ -1,3 +1,4 @@ +import html import re import regex @@ -38,12 +39,14 @@ class Line(Block): def formatted_text(self, document): text = "" for block in self.contained_blocks(document, (BlockTypes.Span,)): + block_text = html.escape(block.text) + if block.italic: - text += f"*{block.text}*" + text += f"{block_text}" elif block.bold: - text += f"**{block.text}**" + text += f"{block_text}" else: - text += block.text + text += block_text return text diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py index 3496695c..50ab89de 100644 --- a/marker/schema/text/span.py +++ b/marker/schema/text/span.py @@ -31,6 +31,10 @@ def bold(self): def italic(self): return 'italic' in self.formats + @property + def math(self): + return 'math' in self.formats + def assemble_html(self, child_blocks, parent_structure): if self.ignore_for_output: return "" @@ -58,4 +62,6 @@ def assemble_html(self, child_blocks, parent_structure): return f"{text}" elif self.bold: return f"{text}" + elif self.math: + return f"{text}" return text