Skip to content

Commit

Permalink
Add debug utils, fix output quality issues
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 18, 2024
1 parent 6d8e180 commit 8bd872b
Show file tree
Hide file tree
Showing 8 changed files with 256 additions and 22 deletions.
12 changes: 7 additions & 5 deletions marker/v2/builders/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class StructureBuilder(BaseBuilder):
gap_threshold: int = 10
gap_threshold: int = .05

def __init__(self, config=None):
super().__init__(config)
Expand All @@ -21,6 +21,7 @@ def __call__(self, document: Document):
self.group_lists(page)

def group_caption_blocks(self, page: PageGroup):
gap_threshold_px = self.gap_threshold * page.polygon.height
for i, block_id in enumerate(page.structure):
block = page.get_block(block_id)
if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]:
Expand All @@ -32,18 +33,18 @@ def group_caption_blocks(self, page: PageGroup):
prev_block = page.get_block(prev_block_id)
if all([
prev_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote],
prev_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
prev_block.polygon.minimum_gap(block.polygon) < gap_threshold_px
]):
block_structure.insert(0, prev_block_id)
selected_polygons.append(prev_block.polygon)
selected_polygons.append(selected_polygons[0])
else:
break

for j, next_block_id in enumerate(page.structure[i + 1:]):
next_block = page.get_block(next_block_id)
if all([
next_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote],
next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
]):
block_structure.append(next_block_id)
selected_polygons.append(next_block.polygon)
Expand All @@ -62,6 +63,7 @@ def group_caption_blocks(self, page: PageGroup):
page.remove_structure_items(block_structure)

def group_lists(self, page: PageGroup):
gap_threshold_px = self.gap_threshold * page.polygon.height
for i, block_id in enumerate(page.structure):
block = page.get_block(block_id)
if block.block_type not in [BlockTypes.ListItem]:
Expand All @@ -73,7 +75,7 @@ def group_lists(self, page: PageGroup):
next_block = page.get_block(next_block_id)
if all([
next_block.block_type == BlockTypes.ListItem,
next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
]):
block_structure.append(next_block_id)
selected_polygons.append(next_block.polygon)
Expand Down
16 changes: 13 additions & 3 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning

from marker.v2.processors.sectionheader import SectionHeaderProcessor
from marker.v2.providers.pdf import PdfProvider
import tempfile
from typing import List, Optional

import click
import datasets
from pydantic import BaseModel

from marker.v2.builders.document import DocumentBuilder
from marker.v2.builders.layout import LayoutBuilder
Expand All @@ -20,6 +19,7 @@
from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \
setup_detection_model
from marker.v2.renderers.markdown import MarkdownRenderer
from marker.v2.processors.debug import DebugProcessor


class PdfConverter(BaseConverter):
Expand Down Expand Up @@ -49,19 +49,29 @@ def __call__(self, filepath: str):
section_header_processor = SectionHeaderProcessor(self.config)
section_header_processor(document)

debug_processor = DebugProcessor(self.config)
debug_processor(document)

renderer = MarkdownRenderer(self.config)
return renderer(document)


@click.command()
@click.option("--output", type=click.Path(exists=False), required=False, default="temp")
@click.option("--fname", type=str, default="adversarial.pdf")
def main(output: str, fname: str):
@click.option("--debug", is_flag=True)
def main(output: str, fname: str, debug: bool):
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
idx = dataset['filename'].index(fname)
out_filename = fname.rsplit(".", 1)[0] + ".md"
os.makedirs(output, exist_ok=True)

config = {}
if debug:
config["debug_pdf_images"] = True
config["debug_layout_images"] = True
config["debug_json"] = True

with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
temp_pdf.write(dataset['pdf'][idx])
temp_pdf.flush()
Expand Down
148 changes: 148 additions & 0 deletions marker/v2/processors/debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import json
import os

import requests
from PIL import Image, ImageDraw, ImageFont

from marker.settings import settings
from marker.v2.processors import BaseProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document


class DebugProcessor(BaseProcessor):
    """
    Processor that writes debugging artifacts for a document.

    Depending on which flags are enabled it renders per-page PNG overlays of
    the page's blocks and/or dumps the page data as JSON.  Everything is
    written to ``<debug_data_folder>/<document file name without extension>``.
    """
    block_types = tuple()
    # Root folder; each document gets its own sub-folder below it.
    debug_data_folder: str = "debug_data"
    # Draw line text and block boxes on a blank white canvas.
    debug_layout_images: bool = False
    # Draw block boxes on top of a copy of the page's high-res image.
    debug_pdf_images: bool = False
    # Dump the page models to blocks.json.
    debug_json: bool = False
    # Local path of the TTF font used for labels (downloaded when missing).
    render_font: str = os.path.join(settings.FONT_DIR, "GoNotoCurrent-Regular.ttf")
    # Base URL; the font's file name is appended to it for the download.
    font_dl_path: str = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"

    def __call__(self, document: Document):
        """Write every enabled debug artifact for ``document``."""
        # Remove extension from doc name
        doc_base = os.path.basename(document.filepath).rsplit(".", 1)[0]
        self.debug_folder = os.path.join(self.debug_data_folder, doc_base)

        # Nothing requested: do not leave empty folders behind on normal runs.
        if not any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
            return

        os.makedirs(self.debug_folder, exist_ok=True)

        if self.debug_layout_images:
            self.draw_layout_debug_images(document)
            print(f"Dumped layout debug images to {self.debug_data_folder}")

        if self.debug_pdf_images:
            self.draw_layout_debug_images(document, pdf_mode=True)
            print(f"Dumped PDF debug images to {self.debug_data_folder}")

        if self.debug_json:
            self.dump_block_debug_data(document)
            print(f"Dumped block debug data to {self.debug_data_folder}")

    def draw_layout_debug_images(self, document: Document, pdf_mode = False):
        """
        Save one annotated PNG per page into ``self.debug_folder``.

        With ``pdf_mode=False`` the canvas is blank and the text of every
        Line block is drawn at its position; with ``pdf_mode=True`` the canvas
        is a copy of the page's high-res image and no line text is drawn.
        In both modes every non-Line/non-Span child gets a red box with its
        block type and a green index label.
        """
        for idx, page in enumerate(document.pages):
            img_size = page.highres_image.size

            if pdf_mode:
                png_image = page.highres_image.copy()
            else:
                png_image = Image.new("RGB", img_size, color="white")

                # Line text is only rendered on the blank canvas; on the real
                # page image the text is already visible.
                line_bboxes = []
                line_text = []
                for child in page.children:
                    if child.block_type != BlockTypes.Line:
                        continue

                    bbox = child.polygon.rescale(page.polygon.size, img_size).bbox
                    line_bboxes.append(bbox)
                    line_text.append(child.raw_text(document))

                self.render_on_image(line_bboxes, png_image, labels=line_text, color="black", draw_bbox=False, label_font_size=24)

            layout_bboxes = []
            layout_labels = []
            for child in page.children:
                if child.block_type in [BlockTypes.Line, BlockTypes.Span]:
                    continue

                bbox = child.polygon.rescale(page.polygon.size, img_size).bbox
                layout_bboxes.append(bbox)
                layout_labels.append(str(child.block_type))

            self.render_on_image(layout_bboxes, png_image, labels=layout_labels, color="red", label_font_size=24)

            # Second pass: index of each box within page.children order
            # (after filtering), drawn slightly offset in green.
            order_labels = [str(i) for i in range(len(layout_bboxes))]
            self.render_on_image(
                layout_bboxes,
                png_image,
                labels=order_labels,
                color="green",
                draw_bbox=False,
                label_offset=5
            )

            filecomp = "pdf" if pdf_mode else "layout"
            debug_file = os.path.join(self.debug_folder, f"{filecomp}_page_{idx}.png")
            png_image.save(debug_file)

    def dump_block_debug_data(self, document: Document):
        """Write ``model_dump()`` of every page (image fields excluded) to blocks.json."""
        debug_file = os.path.join(self.debug_folder, "blocks.json")
        debug_data = [
            # A set is the documented container for field names to exclude.
            page.model_dump(exclude={"lowres_image", "highres_image"})
            for page in document.pages
        ]

        with open(debug_file, "w", encoding="utf-8") as f:
            json.dump(debug_data, f)

    def get_font_path(self) -> str:
        """
        Return the path of the label font, downloading it first if missing.

        The download is streamed to a temporary ``.part`` file and moved into
        place only after it completed, so a failed or interrupted request can
        never leave a truncated file at ``render_font`` that later calls
        would mistake for a usable font.

        Raises:
            requests.RequestException: the download failed or timed out.
        """
        if not os.path.exists(self.render_font):
            os.makedirs(os.path.dirname(self.render_font), exist_ok=True)
            font_dl_path = f"{self.font_dl_path}/{os.path.basename(self.render_font)}"
            partial_path = f"{self.render_font}.part"
            try:
                with requests.get(font_dl_path, stream=True, timeout=60) as r:
                    # Check the status before anything is written to disk.
                    r.raise_for_status()
                    with open(partial_path, "wb") as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(partial_path, self.render_font)
            finally:
                # Only still present when the download did not finish.
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        return self.render_font

    def get_text_size(self, text, font):
        """Return ``(width, height)`` of ``text`` rendered with ``font``."""
        # A zero-sized scratch image is enough to obtain a drawing context.
        im = Image.new(mode="P", size=(0, 0))
        draw = ImageDraw.Draw(im)
        _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
        return width, height

    def render_on_image(self, bboxes, image, labels=None, label_offset=1, label_font_size=10, color: str | list='red', draw_bbox=True):
        """
        Draw boxes and/or labels onto ``image`` in place and return it.

        Args:
            bboxes: iterable of 4-value boxes; values are truncated to int.
            image: PIL image to draw on.
            labels: optional sequence indexed like ``bboxes``; each label is
                drawn on a white background at the box's top-left corner.
            label_offset: pixel offset of the label from that corner.
            label_font_size: font size passed to ``ImageFont.truetype``.
            color: one color for everything, or a list indexed like ``bboxes``.
            draw_bbox: whether to draw the box outline.
        """
        draw = ImageDraw.Draw(image)

        # The font is only needed for labels; skip loading (and possibly
        # downloading) it when only outlines are drawn.
        label_font = None
        if labels is not None:
            label_font = ImageFont.truetype(self.get_font_path(), label_font_size)

        def color_for(index):
            # Resolved lazily so a per-box color list is only indexed for
            # boxes that actually get drawn.
            return color[index] if isinstance(color, list) else color

        for i, bbox in enumerate(bboxes):
            bbox = [int(p) for p in bbox]
            if draw_bbox:
                draw.rectangle(bbox, outline=color_for(i), width=1)

            if labels is None:
                continue

            label = labels[i]
            text_position = (
                bbox[0] + label_offset,
                bbox[1] + label_offset
            )
            text_size = self.get_text_size(label, label_font)
            # Nothing visible to draw (e.g. empty label).
            if text_size[0] <= 0 or text_size[1] <= 0:
                continue
            box_position = (
                text_position[0],
                text_position[1],
                text_position[0] + text_size[0],
                text_position[1] + text_size[1]
            )
            # White backing rectangle keeps the label readable.
            draw.rectangle(box_position, fill="white")
            draw.text(
                text_position,
                label,
                fill=color_for(i),
                font=label_font
            )

        return image
48 changes: 48 additions & 0 deletions marker/v2/processors/ignoretext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from collections import Counter

from marker.v2.processors import BaseProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document


class IgnoreTextProcessor(BaseProcessor):
    """
    Flag text that repeats at the top or bottom of pages as header/footer.

    For every page the first and the last child whose type is in
    ``block_types`` are collected.  Within each of the two collections, any
    block whose raw text occurs in more than ``common_element_threshold`` of
    the collected blocks gets ``is_header_footer = True``.
    """
    block_types = (BlockTypes.Text,)
    # Fraction of collected blocks that must share a text for it to count as common.
    common_element_threshold = .6
    # NOTE(review): not read anywhere in this class - confirm whether it is still needed.
    max_blocks = 1

    def __call__(self, document: Document):
        """Collect each page's first/last matching block and flag the repeated ones."""
        first_blocks = []
        last_blocks = []
        for page in document.pages:
            # Only blocks of the configured types are candidates, for both
            # the first and the last position on the page.
            candidates = [
                block for block in page.children
                if block.block_type in self.block_types
            ]
            if not candidates:
                continue

            first_blocks.append(candidates[0])
            last_blocks.append(candidates[-1])

        self.filter_common_elements(document, first_blocks)
        self.filter_common_elements(document, last_blocks)

    def filter_common_elements(self, document, lines):
        """
        Set ``is_header_footer`` on the blocks in ``lines`` whose text is common.

        Args:
            document: passed through to each block's ``raw_text``.
            lines: blocks to compare with each other.

        Returns:
            The list of blocks that were flagged (empty when nothing was).
        """
        # We can't filter if we don't have enough pages to find common elements
        if len(lines) < 3:
            return []

        # Extract the text once per block; it is needed for counting and matching.
        texts = [b.raw_text(document) for b in lines]
        min_count = len(lines) * self.common_element_threshold
        common = {text for text, count in Counter(texts).items() if count > min_count}

        flagged = []
        for block, text in zip(lines, texts):
            if text in common:
                block.is_header_footer = True
                flagged.append(block)
        return flagged
3 changes: 3 additions & 0 deletions marker/v2/schema/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ class BlockTypes(Enum):
Text = auto()
TableOfContents = auto()
Document = auto()

def __str__(self):
    """Return only the member's name instead of the default Enum string form."""
    return self.name
16 changes: 11 additions & 5 deletions marker/v2/schema/blocks/listitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,16 @@
from marker.v2.schema.blocks import Block


def replace_bullets(text):
def replace_bullets(child_blocks):
# Replace bullet characters with a -
bullet_pattern = r"(^|[\n ])[•●○■▪▫–—]( )"
replaced_string = re.sub(bullet_pattern, r"\1-\2", text)
return replaced_string
first_block = None
while len(child_blocks) > 0:
first_block = child_blocks[0]
child_blocks = first_block.children

if first_block.id.block_type == BlockTypes.Line:
bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○■▪▫–—-]( )"
first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html)


class ListItem(Block):
Expand All @@ -17,5 +22,6 @@ class ListItem(Block):
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
template = replace_bullets(template)
# Remove the first bullet character
replace_bullets(child_blocks)
return f"<li>{template}</li>"
1 change: 1 addition & 0 deletions marker/v2/schema/blocks/text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


class Text(Block):
block_type: BlockTypes = BlockTypes.Text

Expand Down
34 changes: 25 additions & 9 deletions marker/v2/schema/polygon.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,31 @@ def minimum_gap(self, other: PolygonBox):
if self.intersection_pct(other) > 0:
return 0

x_dist = min(abs(self.bbox[0] - other.bbox[2]), abs(self.bbox[2] - other.bbox[0]))
y_dist = min(abs(self.bbox[1] - other.bbox[3]), abs(self.bbox[3] - other.bbox[1]))

if x_dist == 0 or self.overlap_x(other) > 0:
return y_dist
if y_dist == 0 or self.overlap_y(other) > 0:
return x_dist

return (x_dist ** 2 + y_dist ** 2) ** 0.5
def dist(p1, p2):
return ((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) ** 0.5

left = other.bbox[2] < self.bbox[0]
right = self.bbox[2] < other.bbox[0]
bottom = other.bbox[3] < self.bbox[1]
top = self.bbox[3] < other.bbox[1]
if top and left:
return dist((self.bbox[0], self.bbox[3]), (other.bbox[2], other.bbox[1]))
elif left and bottom:
return dist((self.bbox[0], self.bbox[1]), (other.bbox[2], other.bbox[3]))
elif bottom and right:
return dist((self.bbox[2], self.bbox[1]), (other.bbox[0], other.bbox[3]))
elif right and top:
return dist((self.bbox[2], self.bbox[3]), (other.bbox[0], other.bbox[1]))
elif left:
return self.bbox[0] - other.bbox[2]
elif right:
return other.bbox[0] - self.bbox[2]
elif bottom:
return self.bbox[1] - other.bbox[3]
elif top:
return other.bbox[1] - self.bbox[3]
else:
return 0

def center_distance(self, other: PolygonBox):
return ((self.center[0] - other.center[0]) ** 2 + (self.center[1] - other.center[1]) ** 2) ** 0.5
Expand Down

0 comments on commit 8bd872b

Please sign in to comment.