Skip to content

Commit

Permalink
fix extra chars that don't fit neatly into lines
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Dec 5, 2024
1 parent 95caca0 commit cfe08ad
Showing 1 changed file with 45 additions and 0 deletions.
45 changes: 45 additions & 0 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,51 @@ def merge_chars_into_bboxes(line_bboxes, chars, tolerance=0):
# Update the chars list with unmerged characters
chars = remaining_chars

for line in merged_lines:
line["chars"] = sorted(line["chars"], key=lambda c: c["char_idx"])

if remaining_chars:
line_ranges = []
for line in merged_lines:
if line["chars"]:
char_indices = [char["char_idx"] for char in line["chars"]]
line_ranges.append((min(char_indices), max(char_indices)))
else:
line_ranges.append((None, None))

for char in sorted(remaining_chars, key=lambda c: c["char_idx"]):
char_idx = char["char_idx"]
added_to_line = False

for i, (min_idx, max_idx) in enumerate(line_ranges):
if min_idx is not None and max_idx is not None and min_idx <= char_idx <= max_idx:
merged_lines[i]["chars"].append(char)
added_to_line = True
break

if not added_to_line:
for i, (min_idx, max_idx) in reversed(list(enumerate(line_ranges))):
if min_idx is None or max_idx is None:
continue
if min_idx < char_idx and char_idx > max_idx:
merged_lines[i]["chars"].append(char)
added_to_line = True
break

if not added_to_line:
print("Could not find line for char", char)
pass

for line in merged_lines:
if line["chars"]:
line["chars"] = sorted(line["chars"], key=lambda c: c["char_idx"])

min_x1 = min(char["bbox"][0] for char in line["chars"])
min_y1 = min(char["bbox"][1] for char in line["chars"])
max_x2 = max(char["bbox"][2] for char in line["chars"])
max_y2 = max(char["bbox"][3] for char in line["chars"])
line["bbox"] = [min_x1, min_y1, max_x2, max_y2]

return merged_lines


Expand Down

0 comments on commit cfe08ad

Please sign in to comment.