From e48364c7be191b1c867a3df1e1a37b3c34cb1674 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Fri, 20 Dec 2024 15:54:09 +0000
Subject: [PATCH] parse out and recreate spans in the high quality text
 processor and handle inline math rendering

---
 marker/processors/high_quality_text.py | 70 +++++++++++++++++++-------
 marker/providers/pdf.py                |  2 +-
 marker/renderers/markdown.py           |  9 +++-
 marker/schema/blocks/equation.py       |  2 +-
 marker/schema/text/line.py             |  9 ++--
 marker/schema/text/span.py             |  6 +++
 6 files changed, 73 insertions(+), 25 deletions(-)
diff --git a/marker/processors/high_quality_text.py b/marker/processors/high_quality_text.py
index afb23119..48afef62 100644
--- a/marker/processors/high_quality_text.py
+++ b/marker/processors/high_quality_text.py
@@ -1,6 +1,3 @@
-from marker.processors import BaseProcessor
-from marker.schema import BlockTypes
-from marker.schema.document import Document
 import json
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -8,10 +5,12 @@
 
 import google.generativeai as genai
 import PIL
+from bs4 import BeautifulSoup
 from google.ai.generativelanguage_v1beta.types import content
 from google.api_core.exceptions import ResourceExhausted
 from tqdm import tqdm
 
+from marker.processors import BaseProcessor
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block
 from marker.schema.document import Document
@@ -67,7 +66,7 @@ class HighQualityTextProcessor(BaseProcessor):
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
 5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error.
-6. Ensure that inline math is properly enclosed in dollar signs.
+6. Ensure that inline math is properly enclosed in inline math tags.
 7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.
 8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
 
@@ -96,7 +95,7 @@ class HighQualityTextProcessor(BaseProcessor):
   "the model's risk under the worst-case perturbations, is cur-\n",
   "rently the most effective approach for improving the robust-\n",
   "ness of deep neural networks. For a given neural network\n",
-  "$f(x, w)$ with parameters $w$, the optimization objective of\n",
+  "<math>f(x, w)</math> with parameters <math>w</math>, the optimization objective of\n",
   "AT can be formulated as follows:\n"
  ]
 }
@@ -167,21 +166,54 @@ def process_block_rewriting(self, document: Document, page: PageGroup, block: Bl
 
         if corrected_lines and len(corrected_lines) == len(extracted_lines):
             for text_line, corrected_text in zip(text_lines, corrected_lines):
-                span_block = page.add_full_block(
-                    SpanClass(
-                        polygon=text_line.polygon,
-                        text=corrected_text + "\n",
-                        font='Unknown',
-                        font_weight=0,
-                        font_size=0,
-                        minimum_position=0,
-                        maximum_position=0,
-                        formats=['plain', 'math'],
-                        page_id=text_line.page_id,
-                        text_extraction_method="gemini",
+                text_line.structure = []
+                corrected_spans = self.text_to_spans(corrected_text)
+
+                for span_idx, span in enumerate(corrected_spans):
+                    if span_idx == len(corrected_spans) - 1:
+                        span['content'] += "\n"
+
+                    span_block = page.add_full_block(
+                        SpanClass(
+                            polygon=text_line.polygon,
+                            text=span['content'],
+                            font='Unknown',
+                            font_weight=0,
+                            font_size=0,
+                            minimum_position=0,
+                            maximum_position=0,
+                            formats=[span['type']],
+                            page_id=text_line.page_id,
+                            text_extraction_method="gemini",
+                        )
                     )
-                )
-                text_line.structure = [span_block.id]
+                    text_line.structure.append(span_block.id)
+
+    def text_to_spans(self, text):
+        """Split corrected text into a list of {'type', 'content'} span dicts.
+
+        Top-level <b>/<i>/<math> tags map to bold/italic/math; the rest is plain.
+        """
+        tag_types = {
+            'b': 'bold',
+            'i': 'italic',
+            'math': 'math'
+        }
+        spans = []
+
+        # .children yields only the top-level nodes, so no per-descendant parent walk.
+        for element in BeautifulSoup(text, 'html.parser').children:
+            if element.name in tag_types:
+                span_type, content = tag_types[element.name], element.get_text()
+            else:
+                # Tag.string is None for an unknown tag with several children, which
+                # used to drop its text; flatten such tags with get_text() instead.
+                span_type = 'plain'
+                content = element.string if element.name is None else element.get_text()
+            if content or span_type != 'plain':
+                spans.append({'type': span_type, 'content': content})
+
+        return spans
 
     def extract_image(self, page: PageGroup, image_block: Block, expand: float = 0.01):
         page_img = page.lowres_image
diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index 64a8d9a6..dd88c342 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -79,7 +79,7 @@ def font_flags_to_format(self, flags: int | None) -> Set[str]:
         formats = set()
         if set_flags == {"Symbolic", "Italic"} or \
                 set_flags == {"Symbolic", "Italic", "UseExternAttr"}:
-            formats.add("math")
+            formats.add("plain")
         elif set_flags == {"UseExternAttr"}:
             formats.add("plain")
         elif set_flags == {"Plain"}:
diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
index 0cadaf16..b7cc8ede 100644
--- a/marker/renderers/markdown.py
+++ b/marker/renderers/markdown.py
@@ -30,7 +30,7 @@ def convert_div(self, el, text, convert_as_inline):
         else:
             return text
 
-    def convert_p(self, el, text, *args):
+    def convert_p(self, el, text, convert_as_inline):
         hyphens = r'-—¬'
         has_continuation = el.has_attr('class') and 'has-continuation' in el['class']
         if has_continuation:
@@ -43,6 +43,13 @@ def convert_p(self, el, text, *args):
                 return f"{text}"
         return f"{text}\n\n" if text else ""  # default convert_p behavior
 
+    def convert_math(self, el, text, convert_as_inline):
+        # Inline math is wrapped in dollar signs; display="block" math passes through as-is.
+        if el.get('display') == 'block':
+            return text  # TODO: Fix block math handling
+
+        return f"${text}$"
+
 
 class MarkdownOutput(BaseModel):
     markdown: str
diff --git a/marker/schema/blocks/equation.py b/marker/schema/blocks/equation.py
index 08caf707..a3d7478d 100644
--- a/marker/schema/blocks/equation.py
+++ b/marker/schema/blocks/equation.py
@@ -10,7 +10,7 @@ class Equation(Block):
 
     def assemble_html(self, child_blocks, parent_structure=None):
         if self.latex:
-            return f"\n<p><math>{html.escape(self.latex)}</math></p>\n"
+            return f"\n<p><math display=\"block\">{html.escape(self.latex)}</math></p>\n"
         else:
             template = super().assemble_html(child_blocks, parent_structure)
             return f"<p>{template}</p>"
diff --git a/marker/schema/text/line.py b/marker/schema/text/line.py
index 460307c7..70469757 100644
--- a/marker/schema/text/line.py
+++ b/marker/schema/text/line.py
@@ -1,3 +1,4 @@
+import html
 import re
 
 import regex
@@ -38,12 +39,14 @@ class Line(Block):
     def formatted_text(self, document):
         text = ""
         for block in self.contained_blocks(document, (BlockTypes.Span,)):
+            block_text = html.escape(block.text)
+
             if block.italic:
-                text += f"*{block.text}*"
+                text += f"<i>{block_text}</i>"
             elif block.bold:
-                text += f"**{block.text}**"
+                text += f"<b>{block_text}</b>"
             else:
-                text += block.text
+                text += block_text
 
         return text
 
diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py
index 3496695c..50ab89de 100644
--- a/marker/schema/text/span.py
+++ b/marker/schema/text/span.py
@@ -31,6 +31,10 @@ def bold(self):
     def italic(self):
         return 'italic' in self.formats
 
+    @property
+    def math(self):
+        return 'math' in self.formats  # True when 'math' is one of this span's formats
+
     def assemble_html(self, child_blocks, parent_structure):
         if self.ignore_for_output:
             return ""
@@ -58,4 +62,6 @@ def assemble_html(self, child_blocks, parent_structure):
             return f"<i>{text}</i>"
         elif self.bold:
             return f"<b>{text}</b>"
+        elif self.math:
+            return f"<math>{text}</math>"
         return text