Skip to content

Commit

Permalink
parse out and recreate spans in the high quality text processor and h…
Browse files Browse the repository at this point in the history
…andle inline math rendering
  • Loading branch information
iammosespaulr committed Dec 20, 2024
1 parent 0bdc447 commit e48364c
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 25 deletions.
70 changes: 51 additions & 19 deletions marker/processors/high_quality_text.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional

import google.generativeai as genai
import PIL
from bs4 import BeautifulSoup
from google.ai.generativelanguage_v1beta.types import content
from google.api_core.exceptions import ResourceExhausted
from tqdm import tqdm

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
Expand Down Expand Up @@ -67,7 +66,7 @@ class HighQualityTextProcessor(BaseProcessor):
* Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
* Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error.
6. Ensure that inline math is properly enclosed in dollar signs.
6. Ensure that inline math is properly enclosed in inline math tags.
7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.
8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
Expand Down Expand Up @@ -96,7 +95,7 @@ class HighQualityTextProcessor(BaseProcessor):
"the model's risk under the worst-case perturbations, is cur-\n",
"rently the most effective approach for improving the robust-\n",
"ness of deep neural networks. For a given neural network\n",
"$f(x, w)$ with parameters $w$, the optimization objective of\n",
"<math>f(x, w)</math> with parameters <math>w</math>, the optimization objective of\n",
"AT can be formulated as follows:\n"
]
}
Expand Down Expand Up @@ -167,21 +166,54 @@ def process_block_rewriting(self, document: Document, page: PageGroup, block: Bl

if corrected_lines and len(corrected_lines) == len(extracted_lines):
for text_line, corrected_text in zip(text_lines, corrected_lines):
span_block = page.add_full_block(
SpanClass(
polygon=text_line.polygon,
text=corrected_text + "\n",
font='Unknown',
font_weight=0,
font_size=0,
minimum_position=0,
maximum_position=0,
formats=['plain', 'math'],
page_id=text_line.page_id,
text_extraction_method="gemini",
text_line.structure = []
corrected_spans = self.text_to_spans(corrected_text)

for span_idx, span in enumerate(corrected_spans):
if span_idx == len(corrected_spans) - 1:
span['content'] += "\n"

span_block = page.add_full_block(
SpanClass(
polygon=text_line.polygon,
text=span['content'],
font='Unknown',
font_weight=0,
font_size=0,
minimum_position=0,
maximum_position=0,
formats=[span['type']],
page_id=text_line.page_id,
text_extraction_method="gemini",
)
)
)
text_line.structure = [span_block.id]
text_line.structure.append(span_block.id)

def text_to_spans(self, text):
    """Split one line of lightly marked-up text into typed spans.

    ``text`` is parsed with BeautifulSoup's ``html.parser`` and only the
    top-level nodes of the parsed fragment are turned into spans:

    * ``<b>``, ``<i>`` and ``<math>`` elements become ``bold``, ``italic``
      and ``math`` spans holding the element's full text.
    * Every other node (bare text, or an element with an unmapped tag)
      becomes a ``plain`` span.

    Returns:
        A list of ``{'type': ..., 'content': ...}`` dicts in document
        order. Unmapped nodes that contain no text are omitted, so the
        list may be empty.
    """
    soup = BeautifulSoup(text, 'html.parser')

    tag_types = {
        'b': 'bold',
        'i': 'italic',
        'math': 'math'
    }
    spans = []

    # The soup's direct children are exactly the nodes that have a single
    # parent, so there is no need to walk every descendant and count its
    # ancestors to find them.
    for element in soup.children:
        if element.name in tag_types:
            spans.append({
                'type': tag_types[element.name],
                'content': element.get_text()
            })
            continue

        # `.string` is None for a tag with more than one child (for
        # example `<sup>a<b>b</b></sup>`); fall back to the element's full
        # text so that such content is not silently dropped.
        content = element.string
        if content is None:
            content = element.get_text()
        if content:
            spans.append({
                'type': 'plain',
                'content': content
            })

    return spans

def extract_image(self, page: PageGroup, image_block: Block, expand: float = 0.01):
page_img = page.lowres_image
Expand Down
2 changes: 1 addition & 1 deletion marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def font_flags_to_format(self, flags: int | None) -> Set[str]:
formats = set()
if set_flags == {"Symbolic", "Italic"} or \
set_flags == {"Symbolic", "Italic", "UseExternAttr"}:
formats.add("math")
formats.add("plain")
elif set_flags == {"UseExternAttr"}:
formats.add("plain")
elif set_flags == {"Plain"}:
Expand Down
9 changes: 8 additions & 1 deletion marker/renderers/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def convert_div(self, el, text, convert_as_inline):
else:
return text

def convert_p(self, el, text, *args):
def convert_p(self, el, text, convert_as_inline):
hyphens = r'-—¬'
has_continuation = el.has_attr('class') and 'has-continuation' in el['class']
if has_continuation:
Expand All @@ -43,6 +43,13 @@ def convert_p(self, el, text, *args):
return f"{text}"
return f"{text}\n\n" if text else "" # default convert_p behavior

def convert_math(self, el, text, convert_as_inline):
    """Convert a ``<math>`` element to its markdown form.

    An element whose ``display`` attribute equals ``block`` has its text
    returned unchanged; any other element is wrapped in single dollar
    signs. ``convert_as_inline`` is accepted but not used here.
    """
    is_block = el.has_attr('display') and el['display'] == 'block'
    # TODO: Fix block math handling
    return text if is_block else f"${text}$"


class MarkdownOutput(BaseModel):
markdown: str
Expand Down
2 changes: 1 addition & 1 deletion marker/schema/blocks/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class Equation(Block):

def assemble_html(self, child_blocks, parent_structure=None):
if self.latex:
return f"\n<p><math>{html.escape(self.latex)}</math></p>\n"
return f"\n<p><math display=\"block\">{html.escape(self.latex)}</math></p>\n"
else:
template = super().assemble_html(child_blocks, parent_structure)
return f"<p>{template}</p>"
9 changes: 6 additions & 3 deletions marker/schema/text/line.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import html
import re

import regex
Expand Down Expand Up @@ -38,12 +39,14 @@ class Line(Block):
def formatted_text(self, document):
text = ""
for block in self.contained_blocks(document, (BlockTypes.Span,)):
block_text = html.escape(block.text)

if block.italic:
text += f"*{block.text}*"
text += f"<i>{block_text}</i>"
elif block.bold:
text += f"**{block.text}**"
text += f"<b>{block_text}</b>"
else:
text += block.text
text += block_text

return text

Expand Down
6 changes: 6 additions & 0 deletions marker/schema/text/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def bold(self):
def italic(self):
return 'italic' in self.formats

@property
def math(self):
    """Whether ``'math'`` is one of this span's formats."""
    span_formats = self.formats
    return 'math' in span_formats

def assemble_html(self, child_blocks, parent_structure):
if self.ignore_for_output:
return ""
Expand Down Expand Up @@ -58,4 +62,6 @@ def assemble_html(self, child_blocks, parent_structure):
return f"<i>{text}</i>"
elif self.bold:
return f"<b>{text}</b>"
elif self.math:
return f"<math>{text}</math>"
return text

0 comments on commit e48364c

Please sign in to comment.