Skip to content

Commit

Permalink
add in missing block creation logic
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Dec 7, 2024
1 parent 3f3dd2f commit 25560f9
Showing 1 changed file with 53 additions and 10 deletions.
63 changes: 53 additions & 10 deletions marker/providers/pdf_parsing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import math
import statistics
from ctypes import byref, c_int, create_string_buffer
from typing import Any, Dict, List, TypedDict, Union

Expand Down Expand Up @@ -185,21 +186,63 @@ def get_lines(spans: Spans) -> Lines:


def get_blocks(lines: Lines) -> Blocks:
    """Group consecutive lines into blocks based on the spacing between them.

    The median horizontal and vertical distance between the bbox centers of
    neighbouring lines is measured over the whole input, scaled by a fixed
    tolerance factor, and used as the threshold for deciding whether a line
    continues the current block or starts a new one.

    Args:
        lines: Lines in reading order. Each line is a mapping with a "bbox"
            entry exposing ``center``, ``x_start``, ``x_end`` and ``merge``.

    Returns:
        A list of blocks, each a dict with the member "lines" and a "bbox"
        built by merging the member lines' bboxes. Empty input yields an
        empty list.
    """
    if not lines:
        return []

    # A lone line has no neighbour to measure gaps against, and
    # statistics.median raises StatisticsError on an empty sequence.
    if len(lines) == 1:
        only_line = lines[0]
        return [{"lines": [only_line], "bbox": only_line["bbox"]}]

    x_diffs = []
    y_diffs = []
    for prev_line, next_line in zip(lines, lines[1:]):
        prev_center = prev_line["bbox"].center
        next_center = next_line["bbox"].center
        x_diffs.append(abs(next_center[0] - prev_center[0]))
        y_diffs.append(abs(next_center[1] - prev_center[1]))

    # Gaps up to 1.5x the typical (median) gap still count as "same block".
    tolerance_factor = 1.5
    allowed_x_gap = statistics.median(x_diffs) * tolerance_factor
    allowed_y_gap = statistics.median(y_diffs) * tolerance_factor

    blocks: Blocks = []
    for line in lines:
        if blocks and _line_continues_block(
            blocks[-1], line, allowed_x_gap, allowed_y_gap
        ):
            block = blocks[-1]
            block["lines"].append(line)
            block["bbox"] = block["bbox"].merge([line["bbox"]])
        else:
            # First line overall, or a line too far from the previous one.
            blocks.append({"lines": [line], "bbox": line["bbox"]})

    return blocks


def _line_continues_block(
    block: Dict[str, Any],
    line: Dict[str, Any],
    allowed_x_gap: float,
    allowed_y_gap: float,
) -> bool:
    """Return True if ``line`` should be appended to ``block``.

    The candidate is compared against the last line already in the block,
    using the distance between their bbox centers and their horizontal
    extents.
    """
    last_bbox = block["lines"][-1]["bbox"]
    line_bbox = line["bbox"]

    last_center = last_bbox.center
    current_center = line_bbox.center
    x_diff = abs(current_center[0] - last_center[0])
    y_diff = abs(current_center[1] - last_center[1])

    if y_diff <= allowed_y_gap:
        # Centers are close on both axes.
        if x_diff <= allowed_x_gap:
            return True
        # The block's only line so far starts further right than this line.
        if len(block["lines"]) == 1 and last_bbox.x_start > line_bbox.x_start:
            return True
        # This line ends before the previous line does.
        if last_bbox.x_end > line_bbox.x_end:
            return True

    # Nearly the same vertical position and the previous line extends past
    # this line's start.
    return y_diff < allowed_y_gap * 0.2 and last_bbox.x_end > line_bbox.x_start

Expand Down

0 comments on commit 25560f9

Please sign in to comment.