parse out and recreate spans in the high quality text processor and handle inline math rendering
VikParuchuri · Dec 20, 2024 · e48364c · e48364c
1 parent 0bdc447
commit e48364c
Show file tree

Hide file tree

Showing 6 changed files with 73 additions and 25 deletions.
diff --git a/marker/processors/high_quality_text.py b/marker/processors/high_quality_text.py
@@ -1,17 +1,16 @@
-from marker.processors import BaseProcessor
-from marker.schema import BlockTypes
-from marker.schema.document import Document
 import json
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Optional
 
 import google.generativeai as genai
 import PIL
+from bs4 import BeautifulSoup
 from google.ai.generativelanguage_v1beta.types import content
 from google.api_core.exceptions import ResourceExhausted
 from tqdm import tqdm
 
+from marker.processors import BaseProcessor
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block
 from marker.schema.document import Document
@@ -67,7 +66,7 @@ class HighQualityTextProcessor(BaseProcessor):
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
 5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error.
-6. Ensure that inline math is properly enclosed in dollar signs.
+6. Ensure that inline math is properly enclosed with inline math tags.
 7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.
 8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
 
@@ -96,7 +95,7 @@ class HighQualityTextProcessor(BaseProcessor):
   "the model's risk under the worst-case perturbations, is cur-\n",
   "rently the most effective approach for improving the robust-\n",
   "ness of deep neural networks. For a given neural network\n",
-  "$f(x, w)$ with parameters $w$, the optimization objective of\n",
+  "<math>f(x, w)</math> with parameters <math>w</math>, the optimization objective of\n",
   "AT can be formulated as follows:\n"
  ]
 }
@@ -167,21 +166,54 @@ def process_block_rewriting(self, document: Document, page: PageGroup, block: Bl
 
         if corrected_lines and len(corrected_lines) == len(extracted_lines):
             for text_line, corrected_text in zip(text_lines, corrected_lines):
-                span_block = page.add_full_block(
-                    SpanClass(
-                        polygon=text_line.polygon,
-                        text=corrected_text + "\n",
-                        font='Unknown',
-                        font_weight=0,
-                        font_size=0,
-                        minimum_position=0,
-                        maximum_position=0,
-                        formats=['plain', 'math'],
-                        page_id=text_line.page_id,
-                        text_extraction_method="gemini",
+                text_line.structure = []
+                corrected_spans = self.text_to_spans(corrected_text)
+
+                for span_idx, span in enumerate(corrected_spans):
+                    if span_idx == len(corrected_spans) - 1:
+                        span['content'] += "\n"
+
+                    span_block = page.add_full_block(
+                        SpanClass(
+                            polygon=text_line.polygon,
+                            text=span['content'],
+                            font='Unknown',
+                            font_weight=0,
+                            font_size=0,
+                            minimum_position=0,
+                            maximum_position=0,
+                            formats=[span['type']],
+                            page_id=text_line.page_id,
+                            text_extraction_method="gemini",
+                        )
                     )
-                )
-                text_line.structure = [span_block.id]
+                    text_line.structure.append(span_block.id)
+
+    def text_to_spans(self, text):
+        """
+        Split a line of HTML-like text into a flat list of format spans.
+
+        Only the top-level nodes of the parsed text become spans. <b>, <i>
+        and <math> tags map to 'bold', 'italic' and 'math'; every other
+        top-level node is emitted as 'plain'.
+
+        Args:
+            text: Line text that may contain <b>, <i> or <math> tags.
+
+        Returns:
+            A list of dicts with 'type' and 'content' keys, in document order.
+            Nodes with no text produce no span.
+        """
+        soup = BeautifulSoup(text, 'html.parser')
+
+        tag_types = {
+            'b': 'bold',
+            'i': 'italic',
+            'math': 'math'
+        }
+        spans = []
+
+        # The direct children of the soup are exactly the nodes that have a
+        # single parent, so there is no need to walk every descendant.
+        for element in soup.children:
+            if element.name in tag_types:
+                spans.append({
+                    'type': tag_types[element.name],
+                    'content': element.get_text()
+                })
+                continue
+
+            # .string is None for a tag with several children (for example an
+            # unrecognised tag wrapping mixed content). Fall back to the full
+            # text of the tag so that content is not silently dropped.
+            content = element.string
+            if content is None:
+                content = element.get_text()
+            if content:
+                spans.append({
+                    'type': 'plain',
+                    'content': content
+                })
+
+        return spans
 
     def extract_image(self, page: PageGroup, image_block: Block, expand: float = 0.01):
         page_img = page.lowres_image

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
@@ -79,7 +79,7 @@ def font_flags_to_format(self, flags: int | None) -> Set[str]:
         formats = set()
         if set_flags == {"Symbolic", "Italic"} or \
                 set_flags == {"Symbolic", "Italic", "UseExternAttr"}:
-            formats.add("math")
+            formats.add("plain")
         elif set_flags == {"UseExternAttr"}:
             formats.add("plain")
         elif set_flags == {"Plain"}:

diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
@@ -30,7 +30,7 @@ def convert_div(self, el, text, convert_as_inline):
         else:
             return text
 
-    def convert_p(self, el, text, *args):
+    def convert_p(self, el, text, convert_as_inline):
         hyphens = r'-—¬'
         has_continuation = el.has_attr('class') and 'has-continuation' in el['class']
         if has_continuation:
@@ -43,6 +43,13 @@ def convert_p(self, el, text, *args):
                 return f"{text}"
         return f"{text}\n\n" if text else ""  # default convert_p behavior
 
+    def convert_math(self, el, text, convert_as_inline):
+        """
+        Convert a <math> element to markdown.
+
+        Inline math is wrapped in single dollar signs. An element whose
+        display attribute is 'block' is returned as bare text for now.
+        """
+        is_inline = not el.has_attr('display') or el['display'] != 'block'
+        if is_inline:
+            return f"${text}$"
+
+        # TODO: Fix block math handling
+        return text
+
 
 class MarkdownOutput(BaseModel):
     markdown: str

diff --git a/marker/schema/blocks/equation.py b/marker/schema/blocks/equation.py
@@ -10,7 +10,7 @@ class Equation(Block):
 
     def assemble_html(self, child_blocks, parent_structure=None):
+        # When self.latex is set, emit it HTML-escaped inside a <math> tag
+        # marked display="block", wrapped in a paragraph. Otherwise wrap the
+        # parent class's assembled HTML in a paragraph.
         if self.latex:
-            return f"\n<p><math>{html.escape(self.latex)}</math></p>\n"
+            return f"\n<p><math display=\"block\">{html.escape(self.latex)}</math></p>\n"
         else:
             template = super().assemble_html(child_blocks, parent_structure)
             return f"<p>{template}</p>"
diff --git a/marker/schema/text/line.py b/marker/schema/text/line.py
@@ -1,3 +1,4 @@
+import html
 import re
 
 import regex
@@ -38,12 +39,14 @@ class Line(Block):
     def formatted_text(self, document):
+        # Build the line's text by concatenating the text of each block that
+        # contained_blocks returns for BlockTypes.Span. Each span's text is
+        # HTML-escaped, then wrapped in <i> or <b> when the span is italic or
+        # bold. Italic is tested first, so a span that is both italic and
+        # bold is wrapped only in <i>.
         text = ""
         for block in self.contained_blocks(document, (BlockTypes.Span,)):
+            # Escape the raw span text before any tags are added around it.
+            block_text = html.escape(block.text)
+
             if block.italic:
-                text += f"*{block.text}*"
+                text += f"<i>{block_text}</i>"
             elif block.bold:
-                text += f"**{block.text}**"
+                text += f"<b>{block_text}</b>"
             else:
-                text += block.text
+                text += block_text
 
         return text
 

diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py
@@ -31,6 +31,10 @@ def bold(self):
     def italic(self):
         return 'italic' in self.formats
 
+    @property
+    def math(self):
+        """Return True when 'math' is one of this span's formats."""
+        return 'math' in self.formats
+
     def assemble_html(self, child_blocks, parent_structure):
         if self.ignore_for_output:
             return ""
@@ -58,4 +62,6 @@ def assemble_html(self, child_blocks, parent_structure):
             return f"<i>{text}</i>"
         elif self.bold:
             return f"<b>{text}</b>"
+        elif self.math:
+            return f"<math>{text}</math>"
         return text