VikParuchuri · VikParuchuri · May 4, 2024 · May 3, 2024 · May 3, 2024 · May 3, 2024
diff --git a/README.md b/README.md
@@ -95,7 +95,7 @@ First, some configuration.  Note that settings can be overridden with env vars,
 - Your torch device will be automatically detected, but you can manually set it also.  For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default.
   - If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU).  For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
   - Depending on your document types, marker's average memory usage per task can vary slightly.  You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
-- By default, marker will use `ocrmypdf` for OCR on CPU, and `surya` on GPU.  Surya is slower on CPU, but more accurate. `ocrmypdf` also requires external dependencies (see above). You can override the default with the `OCR_ENGINE` setting.
+- By default, marker will use `surya` for OCR.  Surya is slower on CPU, but more accurate than tesseract.  If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above).
 - Inspect the other settings in `marker/settings.py`.  You can override any settings in the `local.env` file, or by setting environment variables.
 
 

diff --git a/marker/cleaners/fontstyle.py b/marker/cleaners/fontstyle.py
@@ -20,6 +20,10 @@ def find_bold_italic(pages: List[Page], bold_min_weight=550):
                         span.italic = True
 
                     font_weights.append(span.font_weight)
+
+    if len(font_weights) == 0:
+        return
+
     font_weights = np.array(font_weights)
     bold_thresh = np.percentile(font_weights, 90)
     bold_thresh_lower = np.percentile(font_weights, 75)

diff --git a/marker/cleaners/table.py b/marker/cleaners/table.py
@@ -6,6 +6,27 @@
 import re
 
 
+def sort_table_blocks(blocks, tolerance=5):
+    vertical_groups = {}
+    for block in blocks:
+        if hasattr(block, "bbox"):
+            bbox = block.bbox
+        else:
+            bbox = block["bbox"]
+        group_key = round(bbox[1] / tolerance) * tolerance
+        if group_key not in vertical_groups:
+            vertical_groups[group_key] = []
+        vertical_groups[group_key].append(block)
+
+    # Sort each group horizontally and flatten the groups into a single list
+    sorted_blocks = []
+    for _, group in sorted(vertical_groups.items()):
+        sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0])
+        sorted_blocks.extend(sorted_group)
+
+    return sorted_blocks
+
+
 def replace_dots(text):
     dot_pattern = re.compile(r'(\s*\.\s*){4,}')
     dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
@@ -15,41 +36,100 @@ def replace_dots(text):
     return text
 
 
-def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]:
+def replace_newlines(text):
+    # Replace all newlines
+    newline_pattern = re.compile(r'[\r\n]+')
+    return newline_pattern.sub(' ', text.strip())
+
+
+def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
     table_rows = []
-    row_y_coord = None
     table_row = []
-    for block_idx, block in enumerate(page.blocks):
-        for line_idx, line in enumerate(block.lines):
+    x_position = None
+    sorted_blocks = sort_table_blocks(page.blocks)
+    for block_idx, block in enumerate(sorted_blocks):
+        sorted_lines = sort_table_blocks(block.lines)
+        for line_idx, line in enumerate(sorted_lines):
             line_bbox = line.bbox
             intersect_pct = box_intersection_pct(line_bbox, table_box)
             if intersect_pct < .5 or len(line.spans) == 0:
                 continue
-            normed_y_start = line_bbox[1] / page.height
-            if row_y_coord is None or abs(normed_y_start - row_y_coord) < y_tol:
-                table_row.extend([s.text for s in line.spans])
+            normed_x_start = line_bbox[0] / page.width
+            normed_x_end = line_bbox[2] / page.width
+
+            cells = [[s.bbox, s.text] for s in line.spans]
+            if x_position is None or normed_x_start > x_position - space_tol:
+                # Same row
+                table_row.extend(cells)
             else:
-                table_rows.append(table_row)
-                table_row = [s.text for s in line.spans]
-            row_y_coord = normed_y_start
+                # New row
+                if len(table_row) > 0:
+                    table_rows.append(table_row)
+                table_row = cells
+            x_position = normed_x_end
     if len(table_row) > 0:
         table_rows.append(table_row)
+    table_rows = assign_cells_to_columns(table_rows)
     return table_rows
 
 
-def get_table_pdftext(page: Page, table_box) -> List[List[str]]:
+def assign_cells_to_columns(rows, round_factor=4, tolerance=4):
+    left_edges = []
+    right_edges = []
+    centers = []
+
+    for row in rows:
+        for cell in row:
+            left_edges.append(cell[0][0] / round_factor * round_factor)
+            right_edges.append(cell[0][2] / round_factor * round_factor)
+            centers.append((cell[0][0] + cell[0][2]) / 2 * round_factor / round_factor)
+
+    unique_left = sorted(list(set(left_edges)))
+    unique_right = sorted(list(set(right_edges)))
+    unique_center = sorted(list(set(centers)))
+
+    # Find list with minimum length
+    separators = min([unique_left, unique_right, unique_center], key=len)
+
+    new_rows = []
+    for row in rows:
+        new_row = {}
+        last_col_index = -1
+        for cell in row:
+            left_edge = cell[0][0]
+            column_index = -1
+            for i, separator in enumerate(separators):
+                if left_edge - tolerance < separator and last_col_index < i:
+                    column_index = i
+                    break
+            if column_index == -1:
+                column_index = cell[0][0] # Assign a new column
+            new_row[column_index] = cell[1]
+            last_col_index = column_index
+
+        flat_row = [cell[1] for cell in sorted(new_row.items())]
+        min_column_index = min(new_row.keys())
+        flat_row = [""] * min_column_index + flat_row
+        new_rows.append(flat_row)
+
+    return new_rows
+
+
+def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
     page_width = page.width
     table_rows = []
-    for block_idx, block in enumerate(page.char_blocks):
-        for line_idx, line in enumerate(block["lines"]):
+    table_cell = ""
+    cell_bbox = None
+    prev_end = None
+    table_row = []
+    sorted_char_blocks = sort_table_blocks(page.char_blocks)
+    for block_idx, block in enumerate(sorted_char_blocks):
+        sorted_block_lines = sort_table_blocks(block["lines"])
+        for line_idx, line in enumerate(sorted_block_lines):
             line_bbox = line["bbox"]
             intersect_pct = box_intersection_pct(line_bbox, table_box)
             if intersect_pct < .5:
                 continue
-            prev_end = None
-            table_row = []
-            table_cell = ""
-            cell_bbox = None
             for span in line["spans"]:
                 for char in span["chars"]:
                     x_start, y_start, x_end, y_end = char["bbox"]
@@ -60,18 +140,28 @@ def get_table_pdftext(page: Page, table_box) -> List[List[str]]:
 
                     x_start /= page_width
                     x_end /= page_width
-                    if prev_end is None or x_start - prev_end < .01:
+                    cell_content = replace_dots(replace_newlines(table_cell))
+                    if prev_end is None or abs(x_start - prev_end) < space_tol: # Check if we are in the same cell
                         table_cell += char["char"]
-                    else:
-                        table_row.append(replace_dots(table_cell.strip()))
+                    elif x_start > prev_end - space_tol: # Check if we are on the same line
+                        if len(table_cell) > 0:
+                            table_row.append((cell_bbox, cell_content))
+                        table_cell = char["char"]
+                        cell_bbox = char["bbox"]
+                    else: # New line and cell
+                        if len(table_cell) > 0:
+                            table_row.append((cell_bbox, cell_content))
                         table_cell = char["char"]
                         cell_bbox = char["bbox"]
+                        if len(table_row) > 0:
+                            table_rows.append(table_row)
+                        table_row = []
                     prev_end = x_end
-                if len(table_cell) > 0:
-                    table_row.append(replace_dots(table_cell.strip()))
-                    table_cell = ""
-            if len(table_row) > 0:
-                table_rows.append(table_row)
+    if len(table_cell) > 0:
+        table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell))))
+    if len(table_row) > 0:
+        table_rows.append(table_row)
+    table_rows = assign_cells_to_columns(table_rows)
     return table_rows
 
 

diff --git a/marker/cleaners/text.py b/marker/cleaners/text.py
@@ -0,0 +1,8 @@
+import re
+
+
+def cleanup_text(full_text):
+    full_text = re.sub(r'\n{3,}', '\n\n', full_text)
+    full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
+    full_text = full_text.replace('\xa0', ' ') # Replace non-breaking spaces
+    return full_text
diff --git a/marker/convert.py b/marker/convert.py
@@ -1,4 +1,7 @@
 import warnings
+
+from marker.cleaners.text import cleanup_text
+
 warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
 
 import pypdfium2 as pdfium
@@ -131,8 +134,7 @@ def convert_single_pdf(
     full_text = get_full_text(text_blocks)
 
     # Handle empty blocks being joined
-    full_text = re.sub(r'\n{3,}', '\n\n', full_text)
-    full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
+    full_text = cleanup_text(full_text)
 
     # Replace bullet characters with a -
     full_text = replace_bullets(full_text)

diff --git a/marker/equations/equations.py b/marker/equations/equations.py
@@ -7,7 +7,7 @@
 from marker.equations.inference import get_total_texify_tokens, get_latex_batched
 from marker.schema.bbox import rescale_bbox
 from marker.schema.page import Page
-from marker.schema.block import Line, Span, Block, bbox_from_lines
+from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines
 from marker.settings import settings
 
 
@@ -28,11 +28,7 @@ def find_equation_blocks(page, processor):
                     equation_lines[region_idx].append(line)
 
                     if region_idx not in insert_points:
-                        # Insert before the block if line is at the beginning of the block, otherwise after the block
-                        if line_idx <= len(block.lines) // 2:
-                            insert_points[region_idx] = block_idx
-                        else:
-                            insert_points[region_idx] = block_idx + 1
+                        insert_points[region_idx] = (block_idx, line_idx)
 
     block_lines_to_remove = defaultdict(set)
     for region_idx, equation_region in enumerate(equation_regions):
@@ -44,8 +40,13 @@ def find_equation_blocks(page, processor):
         equation_bbox = bbox_from_lines(equation_block)
 
         total_tokens = get_total_texify_tokens(block_text, processor)
-        selected_blocks = (equation_insert, total_tokens, block_text, equation_bbox)
+        equation_insert_line_idx = equation_insert[1]
+        equation_insert_line_idx -= len(
+            [x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]])
+
+        selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_bbox]
         if total_tokens < settings.TEXIFY_MODEL_MAX:
+            # Account for the lines we're about to remove
             for item in lines_to_remove[region_idx]:
                 block_lines_to_remove[item[0]].add(item[1])
             equation_blocks.append(selected_blocks)
@@ -58,12 +59,19 @@ def find_equation_blocks(page, processor):
     return equation_blocks
 
 
+def increment_insert_points(page_equation_blocks, insert_block_idx, insert_count):
+    for idx, (block_idx, line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
+        if block_idx >= insert_block_idx:
+            page_equation_blocks[idx][0] += insert_count
+
+
 def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnum, processor):
     converted_spans = []
     idx = 0
     success_count = 0
     fail_count = 0
-    for block_number, (insert_point, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
+    total_inserted = 0
+    for block_number, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
         latex_text = predictions[block_number]
         conditions = [
             get_total_texify_tokens(latex_text, processor) < settings.TEXIFY_MODEL_MAX,  # Make sure we didn't get to the overall token max, indicates run-on
@@ -97,7 +105,25 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
             new_block.lines[0].spans[0].text = latex_text
             converted_spans.append(deepcopy(new_block.lines[0].spans[0]))
 
-        page_blocks.blocks.insert(insert_point, new_block)
+        # Add in the new LaTeX block
+        if insert_line_idx == 0:
+            page_blocks.blocks.insert(insert_block_idx, new_block)
+            increment_insert_points(page_equation_blocks, insert_block_idx, 1)
+        elif insert_line_idx >= len(page_blocks.blocks[insert_block_idx].lines):
+            page_blocks.blocks.insert(insert_block_idx + 1, new_block)
+            increment_insert_points(page_equation_blocks, insert_block_idx + 1, 1)
+        else:
+            new_blocks = []
+            for block_idx, block in enumerate(page_blocks.blocks):
+                if block_idx == insert_block_idx:
+                    split_block = split_block_lines(block, insert_line_idx)
+                    new_blocks.append(split_block[0])
+                    new_blocks.append(new_block)
+                    new_blocks.append(split_block[1])
+                    increment_insert_points(page_equation_blocks, insert_block_idx, 2)
+                else:
+                    new_blocks.append(block)
+            page_blocks.blocks = new_blocks
 
     return success_count, fail_count, converted_spans
 
@@ -117,7 +143,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings.
     token_counts = []
     for page_idx, page_equation_blocks in enumerate(equation_blocks):
         page_obj = doc[page_idx]
-        for equation_idx, (insert_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
+        for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
             png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox)
 
             images.append(png_image)

diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py
@@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]):
     return len(full_text.strip()) == 0
 
 
-def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6):
+def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3):
     found_lines = 0
     for detected_line in page.text_lines.bboxes:
 
@@ -63,7 +63,7 @@ def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6)
         total_intersection = 0
         for block in page.blocks:
             for line in block.lines:
-                intersection_pct = box_intersection_pct(detected_bbox, line.bbox)
+                intersection_pct = box_intersection_pct(line.bbox, detected_bbox)
                 total_intersection += intersection_pct
         if total_intersection > intersect_thresh:
             found_lines += 1

diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
@@ -41,7 +41,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor
             ocr_success += 1
             pages[orig_idx] = page
 
-    return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
+    return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success, "ocr_engine": ocr_method}
 
 
 def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page]) -> List[Optional[Page]]:
@@ -120,8 +120,8 @@ def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
         out_pdf,
         language=langs[0],
         output_type="pdf",
-        redo_ocr=None if settings.OCR_ALL_PAGES else True,
-        force_ocr=True if settings.OCR_ALL_PAGES else None,
+        redo_ocr=None,
+        force_ocr=True,
         progress_bar=False,
         optimize=False,
         fast_web_view=1e6,

diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py
@@ -22,10 +22,14 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
         for l in block["lines"]:
             spans = []
             for i, s in enumerate(l["spans"]):
-                block_text = s["text"].rstrip("\n")
+                block_text = s["text"]
+                # Remove trailing newlines and carriage returns (tesseract)
+                while len(block_text) > 0 and block_text[-1] in ["\n", "\r"]:
+                    block_text = block_text[:-1]
+
                 block_text = block_text.replace("-\n", "") # Remove hyphenated line breaks
                 span_obj = Span(
-                    text=block_text.rstrip("\n"), # Remove end of line newlines, not spaces
+                    text=block_text, # Remove end of line newlines, not spaces
                     bbox=s["bbox"],
                     span_id=f"{pnum}_{span_id}",
                     font=f"{s['font']['name']}_{font_flags_decomposer(s['font']['flags'])}", # Add font flags to end of font
@@ -49,10 +53,15 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
         # Only select blocks with lines
         if len(block_lines) > 0:
             page_blocks.append(block_obj)
+
+    page_bbox = page["bbox"]
+    page_width = abs(page_bbox[2] - page_bbox[0])
+    page_height = abs(page_bbox[3] - page_bbox[1])
+    page_bbox = [0, 0, page_width, page_height]
     out_page = Page(
         blocks=page_blocks,
         pnum=page["page"],
-        bbox=page["bbox"],
+        bbox=page_bbox,
         rotation=page["rotation"],
         char_blocks=page["blocks"]
     )

diff --git a/marker/postprocessors/images.py b/marker/postprocessors/images.py