From cfe08add5d10a1c5fd8bd88861da8e65d59eb7c0 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Thu, 5 Dec 2024 17:11:46 +0000 Subject: [PATCH] fix extra chars that don't fit in neatly into lines --- marker/providers/pdf.py | 45 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index ee469e17..e5e3a439 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -157,6 +157,51 @@ def merge_chars_into_bboxes(line_bboxes, chars, tolerance=0): # Update the chars list with unmerged characters chars = remaining_chars + for line in merged_lines: + line["chars"] = sorted(line["chars"], key=lambda c: c["char_idx"]) + + if remaining_chars: + line_ranges = [] + for line in merged_lines: + if line["chars"]: + char_indices = [char["char_idx"] for char in line["chars"]] + line_ranges.append((min(char_indices), max(char_indices))) + else: + line_ranges.append((None, None)) + + for char in sorted(remaining_chars, key=lambda c: c["char_idx"]): + char_idx = char["char_idx"] + added_to_line = False + + for i, (min_idx, max_idx) in enumerate(line_ranges): + if min_idx is not None and max_idx is not None and min_idx <= char_idx <= max_idx: + merged_lines[i]["chars"].append(char) + added_to_line = True + break + + if not added_to_line: + for i, (min_idx, max_idx) in reversed(list(enumerate(line_ranges))): + if min_idx is None or max_idx is None: + continue + if min_idx < char_idx and char_idx > max_idx: + merged_lines[i]["chars"].append(char) + added_to_line = True + break + + if not added_to_line: + print("Could not find line for char", char) + pass + + for line in merged_lines: + if line["chars"]: + line["chars"] = sorted(line["chars"], key=lambda c: c["char_idx"]) + + min_x1 = min(char["bbox"][0] for char in line["chars"]) + min_y1 = min(char["bbox"][1] for char in line["chars"]) + max_x2 = max(char["bbox"][2] for char in line["chars"]) + max_y2 = max(char["bbox"][3] for char in line["chars"]) + line["bbox"] = [min_x1, min_y1, max_x2, max_y2] + return merged_lines