From df6f8fcf8f7b7d938ad0300155ef5752bf4e1612 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Fri, 3 May 2024 14:14:05 -0700
Subject: [PATCH 1/3] Improve table recognition and equation insertion

---
 marker/cleaners/fontstyle.py    |   4 ++
 marker/cleaners/table.py        | 106 ++++++++++++++++++++++++++------
 marker/equations/equations.py   |  46 +++++++++++---
 marker/ocr/heuristics.py        |   2 +-
 marker/ocr/recognition.py       |   4 +-
 marker/pdf/extract_text.py      |   9 ++-
 marker/postprocessors/images.py |   0
 marker/schema/block.py          |  12 ++++
 8 files changed, 149 insertions(+), 34 deletions(-)
 create mode 100644 marker/postprocessors/images.py

diff --git a/marker/cleaners/fontstyle.py b/marker/cleaners/fontstyle.py
index a92d8bd6..2f4a6185 100644
--- a/marker/cleaners/fontstyle.py
+++ b/marker/cleaners/fontstyle.py
@@ -20,6 +20,10 @@ def find_bold_italic(pages: List[Page], bold_min_weight=550):
                         span.italic = True
 
                     font_weights.append(span.font_weight)
+
+    if len(font_weights) == 0:
+        return
+
     font_weights = np.array(font_weights)
     bold_thresh = np.percentile(font_weights, 90)
     bold_thresh_lower = np.percentile(font_weights, 75)
diff --git a/marker/cleaners/table.py b/marker/cleaners/table.py
index bfb0e200..2a5f36ac 100644
--- a/marker/cleaners/table.py
+++ b/marker/cleaners/table.py
@@ -15,10 +15,17 @@ def replace_dots(text):
     return text
 
 
+def replace_newlines(text):
+    # Replace all newlines
+    newline_pattern = re.compile(r'[\r\n]+')
+    return newline_pattern.sub(' ', text.strip())
+
+
 def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]:
     table_rows = []
-    row_y_coord = None
     table_row = []
+    x_position = None
+    y_position = None
     for block_idx, block in enumerate(page.blocks):
         for line_idx, line in enumerate(block.lines):
             line_bbox = line.bbox
@@ -26,30 +33,81 @@ def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]:
             if intersect_pct < .5 or len(line.spans) == 0:
                 continue
             normed_y_start = line_bbox[1] / page.height
-            if row_y_coord is None or abs(normed_y_start - row_y_coord) < y_tol:
-                table_row.extend([s.text for s in line.spans])
+            normed_x_start = line_bbox[0] / page.width
+            normed_x_end = line_bbox[2] / page.width
+
+            cells = [[s.bbox, s.text] for s in line.spans]
+            if x_position is None or (normed_x_start > x_position and abs(normed_y_start - y_position) < y_tol):
+                # Same row
+                table_row.extend(cells)
             else:
-                table_rows.append(table_row)
-                table_row = [s.text for s in line.spans]
-            row_y_coord = normed_y_start
+                # New row
+                if len(table_row) > 0:
+                    table_rows.append(table_row)
+                table_row = cells
+            y_position = normed_y_start
+            x_position = normed_x_end
     if len(table_row) > 0:
         table_rows.append(table_row)
+    table_rows = assign_cells_to_columns(table_rows)
     return table_rows
 
 
-def get_table_pdftext(page: Page, table_box) -> List[List[str]]:
+def assign_cells_to_columns(rows, round_factor=4, tolerance=4):
+    left_edges = []
+    right_edges = []
+    centers = []
+
+    for row in rows:
+        for cell in row:
+            left_edges.append(cell[0][0] / round_factor * round_factor)
+            right_edges.append(cell[0][2] / round_factor * round_factor)
+            centers.append((cell[0][0] + cell[0][2]) / 2 * round_factor / round_factor)
+
+    unique_left = sorted(list(set(left_edges)))
+    unique_right = sorted(list(set(right_edges)))
+    unique_center = sorted(list(set(centers)))
+
+    # Find list with minimum length
+    separators = min([unique_left, unique_right, unique_center], key=len)
+
+    new_rows = []
+    for row in rows:
+        new_row = {}
+        last_col_index = -1
+        for cell in row:
+            left_edge = cell[0][0]
+            column_index = -1
+            for i, separator in enumerate(separators):
+                if left_edge - tolerance < separator and last_col_index < i:
+                    column_index = i
+                    break
+            if column_index == -1:
+                column_index = cell[0][0] # Assign a new column
+            new_row[column_index] = cell[1]
+            last_col_index = column_index
+
+        flat_row = [cell[1] for cell in sorted(new_row.items())]
+        min_column_index = min(new_row.keys())
+        flat_row = [""] * min_column_index + flat_row
+        new_rows.append(flat_row)
+
+    return new_rows
+
+
+def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
     page_width = page.width
     table_rows = []
+    table_cell = ""
+    cell_bbox = None
+    prev_end = None
+    table_row = []
     for block_idx, block in enumerate(page.char_blocks):
         for line_idx, line in enumerate(block["lines"]):
             line_bbox = line["bbox"]
             intersect_pct = box_intersection_pct(line_bbox, table_box)
             if intersect_pct < .5:
                 continue
-            prev_end = None
-            table_row = []
-            table_cell = ""
-            cell_bbox = None
             for span in line["spans"]:
                 for char in span["chars"]:
                     x_start, y_start, x_end, y_end = char["bbox"]
@@ -60,18 +118,28 @@ def get_table_pdftext(page: Page, table_box) -> List[List[str]]:
 
                     x_start /= page_width
                     x_end /= page_width
-                    if prev_end is None or x_start - prev_end < .01:
+                    cell_content = replace_dots(replace_newlines(table_cell))
+                    if prev_end is None or abs(x_start - prev_end) < space_tol: # Check if we are in the same cell
                         table_cell += char["char"]
-                    else:
-                        table_row.append(replace_dots(table_cell.strip()))
+                    elif x_start > prev_end - space_tol: # Check if we are on the same line
+                        if len(table_cell) > 0:
+                            table_row.append((cell_bbox, cell_content))
                         table_cell = char["char"]
                         cell_bbox = char["bbox"]
+                    else: # New line and cell
+                        if len(table_cell) > 0:
+                            table_row.append((cell_bbox, cell_content))
+                        table_cell = char["char"]
+                        cell_bbox = char["bbox"]
+                        if len(table_row) > 0:
+                            table_rows.append(table_row)
+                        table_row = []
                     prev_end = x_end
-                if len(table_cell) > 0:
-                    table_row.append(replace_dots(table_cell.strip()))
-                    table_cell = ""
-            if len(table_row) > 0:
-                table_rows.append(table_row)
+    if len(table_cell) > 0:
+        table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell))))
+    if len(table_row) > 0:
+        table_rows.append(table_row)
+    table_rows = assign_cells_to_columns(table_rows)
     return table_rows
 
 
diff --git a/marker/equations/equations.py b/marker/equations/equations.py
index d0246f60..da23b136 100644
--- a/marker/equations/equations.py
+++ b/marker/equations/equations.py
@@ -7,7 +7,7 @@
 from marker.equations.inference import get_total_texify_tokens, get_latex_batched
 from marker.schema.bbox import rescale_bbox
 from marker.schema.page import Page
-from marker.schema.block import Line, Span, Block, bbox_from_lines
+from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines
 from marker.settings import settings
 
 
@@ -28,11 +28,7 @@ def find_equation_blocks(page, processor):
                     equation_lines[region_idx].append(line)
 
                     if region_idx not in insert_points:
-                        # Insert before the block if line is at the beginning of the block, otherwise after the block
-                        if line_idx <= len(block.lines) // 2:
-                            insert_points[region_idx] = block_idx
-                        else:
-                            insert_points[region_idx] = block_idx + 1
+                        insert_points[region_idx] = (block_idx, line_idx)
 
     block_lines_to_remove = defaultdict(set)
     for region_idx, equation_region in enumerate(equation_regions):
@@ -44,8 +40,13 @@ def find_equation_blocks(page, processor):
         equation_bbox = bbox_from_lines(equation_block)
 
         total_tokens = get_total_texify_tokens(block_text, processor)
-        selected_blocks = (equation_insert, total_tokens, block_text, equation_bbox)
+        equation_insert_line_idx = equation_insert[1]
+        equation_insert_line_idx -= len(
+            [x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]])
+
+        selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_bbox]
         if total_tokens < settings.TEXIFY_MODEL_MAX:
+            # Account for the lines we're about to remove
             for item in lines_to_remove[region_idx]:
                 block_lines_to_remove[item[0]].add(item[1])
             equation_blocks.append(selected_blocks)
@@ -58,12 +59,19 @@ def find_equation_blocks(page, processor):
     return equation_blocks
 
 
+def increment_insert_points(page_equation_blocks, insert_block_idx, insert_count):
+    for idx, (block_idx, line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
+        if block_idx >= insert_block_idx:
+            page_equation_blocks[idx][0] += insert_count
+
+
 def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnum, processor):
     converted_spans = []
     idx = 0
     success_count = 0
     fail_count = 0
-    for block_number, (insert_point, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
+    total_inserted = 0
+    for block_number, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
         latex_text = predictions[block_number]
         conditions = [
             get_total_texify_tokens(latex_text, processor) < settings.TEXIFY_MODEL_MAX,  # Make sure we didn't get to the overall token max, indicates run-on
@@ -97,7 +105,25 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
             new_block.lines[0].spans[0].text = latex_text
             converted_spans.append(deepcopy(new_block.lines[0].spans[0]))
 
-        page_blocks.blocks.insert(insert_point, new_block)
+        # Add in the new LaTeX block
+        if insert_line_idx == 0:
+            page_blocks.blocks.insert(insert_block_idx, new_block)
+            increment_insert_points(page_equation_blocks, insert_block_idx, 1)
+        elif insert_line_idx >= len(page_blocks.blocks[insert_block_idx].lines):
+            page_blocks.blocks.insert(insert_block_idx + 1, new_block)
+            increment_insert_points(page_equation_blocks, insert_block_idx + 1, 1)
+        else:
+            new_blocks = []
+            for block_idx, block in enumerate(page_blocks.blocks):
+                if block_idx == insert_block_idx:
+                    split_block = split_block_lines(block, insert_line_idx)
+                    new_blocks.append(split_block[0])
+                    new_blocks.append(new_block)
+                    new_blocks.append(split_block[1])
+                    increment_insert_points(page_equation_blocks, insert_block_idx, 2)
+                else:
+                    new_blocks.append(block)
+            page_blocks.blocks = new_blocks
 
     return success_count, fail_count, converted_spans
 
@@ -117,7 +143,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings.
     token_counts = []
     for page_idx, page_equation_blocks in enumerate(equation_blocks):
         page_obj = doc[page_idx]
-        for equation_idx, (insert_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
+        for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
             png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox)
 
             images.append(png_image)
diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py
index ffe6e422..b83d5566 100644
--- a/marker/ocr/heuristics.py
+++ b/marker/ocr/heuristics.py
@@ -63,7 +63,7 @@ def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6)
         total_intersection = 0
         for block in page.blocks:
             for line in block.lines:
-                intersection_pct = box_intersection_pct(detected_bbox, line.bbox)
+                intersection_pct = box_intersection_pct(line.bbox, detected_bbox)
                 total_intersection += intersection_pct
         if total_intersection > intersect_thresh:
             found_lines += 1
diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
index d62624b8..535f6507 100644
--- a/marker/ocr/recognition.py
+++ b/marker/ocr/recognition.py
@@ -120,8 +120,8 @@ def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
         out_pdf,
         language=langs[0],
         output_type="pdf",
-        redo_ocr=None if settings.OCR_ALL_PAGES else True,
-        force_ocr=True if settings.OCR_ALL_PAGES else None,
+        redo_ocr=None,
+        force_ocr=True,
         progress_bar=False,
         optimize=False,
         fast_web_view=1e6,
diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py
index 1512643c..56871d6a 100644
--- a/marker/pdf/extract_text.py
+++ b/marker/pdf/extract_text.py
@@ -25,7 +25,7 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
                 block_text = s["text"].rstrip("\n")
                 block_text = block_text.replace("-\n", "") # Remove hyphenated line breaks
                 span_obj = Span(
-                    text=block_text.rstrip("\n"), # Remove end of line newlines, not spaces
+                    text=block_text, # Remove end of line newlines, not spaces
                     bbox=s["bbox"],
                     span_id=f"{pnum}_{span_id}",
                     font=f"{s['font']['name']}_{font_flags_decomposer(s['font']['flags'])}", # Add font flags to end of font
@@ -49,10 +49,15 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
         # Only select blocks with lines
         if len(block_lines) > 0:
             page_blocks.append(block_obj)
+
+    page_bbox = page["bbox"]
+    page_width = abs(page_bbox[2] - page_bbox[0])
+    page_height = abs(page_bbox[3] - page_bbox[1])
+    page_bbox = [0, 0, page_width, page_height]
     out_page = Page(
         blocks=page_blocks,
         pnum=page["page"],
-        bbox=page["bbox"],
+        bbox=page_bbox,
         rotation=page["rotation"],
         char_blocks=page["blocks"]
     )
diff --git a/marker/postprocessors/images.py b/marker/postprocessors/images.py
new file mode 100644
index 00000000..e69de29b
diff --git a/marker/schema/block.py b/marker/schema/block.py
index df4f90e8..1220b698 100644
--- a/marker/schema/block.py
+++ b/marker/schema/block.py
@@ -86,3 +86,15 @@ def bbox_from_lines(lines: List[Line]):
     max_x = max([line.bbox[2] for line in lines])
     max_y = max([line.bbox[3] for line in lines])
     return [min_x, min_y, max_x, max_y]
+
+
+def split_block_lines(block: Block, split_line_idx: int):
+    new_blocks = []
+    if split_line_idx >= len(block.lines):
+        return [block]
+    elif split_line_idx == 0:
+        return [block]
+    else:
+        new_blocks.append(Block(lines=block.lines[:split_line_idx], bbox=bbox_from_lines(block.lines[:split_line_idx]), pnum=block.pnum))
+        new_blocks.append(Block(lines=block.lines[split_line_idx:], bbox=bbox_from_lines(block.lines[split_line_idx:]), pnum=block.pnum))
+    return new_blocks

From c22d32e75f2398cd7977262dcbd98759fed8dca7 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Fri, 3 May 2024 14:33:49 -0700
Subject: [PATCH 2/3] Sort character blocks for pdf text

---
 marker/cleaners/table.py          | 30 +++++++++++++++++++++++-------
 marker/cleaners/text.py           |  8 ++++++++
 marker/convert.py                 |  6 ++++--
 marker/postprocessors/markdown.py |  2 ++
 4 files changed, 37 insertions(+), 9 deletions(-)
 create mode 100644 marker/cleaners/text.py

diff --git a/marker/cleaners/table.py b/marker/cleaners/table.py
index 2a5f36ac..84e0c674 100644
--- a/marker/cleaners/table.py
+++ b/marker/cleaners/table.py
@@ -6,6 +6,23 @@
 import re
 
 
+def sort_char_blocks(blocks, tolerance=1.25):
+    vertical_groups = {}
+    for block in blocks:
+        group_key = round(block["bbox"][1] / tolerance) * tolerance
+        if group_key not in vertical_groups:
+            vertical_groups[group_key] = []
+        vertical_groups[group_key].append(block)
+
+    # Sort each group horizontally and flatten the groups into a single list
+    sorted_blocks = []
+    for _, group in sorted(vertical_groups.items()):
+        sorted_group = sorted(group, key=lambda x: x["bbox"][0])
+        sorted_blocks.extend(sorted_group)
+
+    return sorted_blocks
+
+
 def replace_dots(text):
     dot_pattern = re.compile(r'(\s*\.\s*){4,}')
     dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
@@ -21,23 +38,21 @@ def replace_newlines(text):
     return newline_pattern.sub(' ', text.strip())
 
 
-def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]:
+def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
     table_rows = []
     table_row = []
     x_position = None
-    y_position = None
     for block_idx, block in enumerate(page.blocks):
         for line_idx, line in enumerate(block.lines):
             line_bbox = line.bbox
             intersect_pct = box_intersection_pct(line_bbox, table_box)
             if intersect_pct < .5 or len(line.spans) == 0:
                 continue
-            normed_y_start = line_bbox[1] / page.height
             normed_x_start = line_bbox[0] / page.width
             normed_x_end = line_bbox[2] / page.width
 
             cells = [[s.bbox, s.text] for s in line.spans]
-            if x_position is None or (normed_x_start > x_position and abs(normed_y_start - y_position) < y_tol):
+            if x_position is None or normed_x_start > x_position - space_tol:
                 # Same row
                 table_row.extend(cells)
             else:
@@ -45,7 +60,6 @@ def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]:
                 if len(table_row) > 0:
                     table_rows.append(table_row)
                 table_row = cells
-            y_position = normed_y_start
             x_position = normed_x_end
     if len(table_row) > 0:
         table_rows.append(table_row)
@@ -102,8 +116,10 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
     cell_bbox = None
     prev_end = None
     table_row = []
-    for block_idx, block in enumerate(page.char_blocks):
-        for line_idx, line in enumerate(block["lines"]):
+    sorted_char_blocks = sort_char_blocks(page.char_blocks)
+    for block_idx, block in enumerate(sorted_char_blocks):
+        sorted_block_lines = sort_char_blocks(block["lines"])
+        for line_idx, line in enumerate(sorted_block_lines):
             line_bbox = line["bbox"]
             intersect_pct = box_intersection_pct(line_bbox, table_box)
             if intersect_pct < .5:
diff --git a/marker/cleaners/text.py b/marker/cleaners/text.py
new file mode 100644
index 00000000..56870ec5
--- /dev/null
+++ b/marker/cleaners/text.py
@@ -0,0 +1,8 @@
+import re
+
+
+def cleanup_text(full_text):
+    full_text = re.sub(r'\n{3,}', '\n\n', full_text)
+    full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
+    full_text = full_text.replace('\xa0', ' ') # Replace non-breaking spaces
+    return full_text
\ No newline at end of file
diff --git a/marker/convert.py b/marker/convert.py
index f5616b24..f3965be6 100644
--- a/marker/convert.py
+++ b/marker/convert.py
@@ -1,4 +1,7 @@
 import warnings
+
+from marker.cleaners.text import cleanup_text
+
 warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
 
 import pypdfium2 as pdfium
@@ -131,8 +134,7 @@ def convert_single_pdf(
     full_text = get_full_text(text_blocks)
 
     # Handle empty blocks being joined
-    full_text = re.sub(r'\n{3,}', '\n\n', full_text)
-    full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
+    full_text = cleanup_text(full_text)
 
     # Replace bullet characters with a -
     full_text = replace_bullets(full_text)
diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py
index da9a914d..a0fada3d 100644
--- a/marker/postprocessors/markdown.py
+++ b/marker/postprocessors/markdown.py
@@ -103,6 +103,8 @@ def line_separator(line1, line2, block_type, is_continuation=False):
         return line1 + "\n\n" + line2
     elif block_type == "Formula":
         return line1 + " " + line2
+    elif block_type == "Table":
+        return line1 + "\n\n" + line2
     else:
         return line1 + "\n" + line2
 

From 9086dd5153a2d21401e5be93dc70fd4d086c976c Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Fri, 3 May 2024 16:30:30 -0700
Subject: [PATCH 3/3] Improve table, markdown, and ocr

---
 README.md                         |   2 +-
 marker/cleaners/table.py          |  20 ++--
 marker/ocr/heuristics.py          |   2 +-
 marker/ocr/recognition.py         |   2 +-
 marker/pdf/extract_text.py        |   6 +-
 marker/postprocessors/markdown.py |  23 ++--
 marker/settings.py                |   6 +-
 poetry.lock                       | 178 ++++++++++++++----------------
 pyproject.toml                    |   1 +
 9 files changed, 119 insertions(+), 121 deletions(-)

diff --git a/README.md b/README.md
index 396433de..f31beba6 100644
--- a/README.md
+++ b/README.md
@@ -95,7 +95,7 @@ First, some configuration.  Note that settings can be overridden with env vars,
 - Your torch device will be automatically detected, but you can manually set it also.  For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default.
   - If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU).  For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
   - Depending on your document types, marker's average memory usage per task can vary slightly.  You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
-- By default, marker will use `ocrmypdf` for OCR on CPU, and `surya` on GPU.  Surya is slower on CPU, but more accurate. `ocrmypdf` also requires external dependencies (see above). You can override the default with the `OCR_ENGINE` setting.
+- By default, marker will use `surya` for OCR.  Surya is slower on CPU, but more accurate than tesseract.  If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above).
 - Inspect the other settings in `marker/settings.py`.  You can override any settings in the `local.env` file, or by setting environment variables.
 
 
diff --git a/marker/cleaners/table.py b/marker/cleaners/table.py
index 84e0c674..fe33e0e0 100644
--- a/marker/cleaners/table.py
+++ b/marker/cleaners/table.py
@@ -6,10 +6,14 @@
 import re
 
 
-def sort_char_blocks(blocks, tolerance=1.25):
+def sort_table_blocks(blocks, tolerance=5):
     vertical_groups = {}
     for block in blocks:
-        group_key = round(block["bbox"][1] / tolerance) * tolerance
+        if hasattr(block, "bbox"):
+            bbox = block.bbox
+        else:
+            bbox = block["bbox"]
+        group_key = round(bbox[1] / tolerance) * tolerance
         if group_key not in vertical_groups:
             vertical_groups[group_key] = []
         vertical_groups[group_key].append(block)
@@ -17,7 +21,7 @@ def sort_char_blocks(blocks, tolerance=1.25):
     # Sort each group horizontally and flatten the groups into a single list
     sorted_blocks = []
     for _, group in sorted(vertical_groups.items()):
-        sorted_group = sorted(group, key=lambda x: x["bbox"][0])
+        sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0])
         sorted_blocks.extend(sorted_group)
 
     return sorted_blocks
@@ -42,8 +46,10 @@ def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
     table_rows = []
     table_row = []
     x_position = None
-    for block_idx, block in enumerate(page.blocks):
-        for line_idx, line in enumerate(block.lines):
+    sorted_blocks = sort_table_blocks(page.blocks)
+    for block_idx, block in enumerate(sorted_blocks):
+        sorted_lines = sort_table_blocks(block.lines)
+        for line_idx, line in enumerate(sorted_lines):
             line_bbox = line.bbox
             intersect_pct = box_intersection_pct(line_bbox, table_box)
             if intersect_pct < .5 or len(line.spans) == 0:
@@ -116,9 +122,9 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
     cell_bbox = None
     prev_end = None
     table_row = []
-    sorted_char_blocks = sort_char_blocks(page.char_blocks)
+    sorted_char_blocks = sort_table_blocks(page.char_blocks)
     for block_idx, block in enumerate(sorted_char_blocks):
-        sorted_block_lines = sort_char_blocks(block["lines"])
+        sorted_block_lines = sort_table_blocks(block["lines"])
         for line_idx, line in enumerate(sorted_block_lines):
             line_bbox = line["bbox"]
             intersect_pct = box_intersection_pct(line_bbox, table_box)
diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py
index b83d5566..2fdb9d8e 100644
--- a/marker/ocr/heuristics.py
+++ b/marker/ocr/heuristics.py
@@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]):
     return len(full_text.strip()) == 0
 
 
-def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6):
+def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3):
     found_lines = 0
     for detected_line in page.text_lines.bboxes:
 
diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
index 535f6507..6da62d8d 100644
--- a/marker/ocr/recognition.py
+++ b/marker/ocr/recognition.py
@@ -41,7 +41,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor
             ocr_success += 1
             pages[orig_idx] = page
 
-    return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
+    return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success, "ocr_engine": ocr_method}
 
 
 def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page]) -> List[Optional[Page]]:
diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py
index 56871d6a..bf10e906 100644
--- a/marker/pdf/extract_text.py
+++ b/marker/pdf/extract_text.py
@@ -22,7 +22,11 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
         for l in block["lines"]:
             spans = []
             for i, s in enumerate(l["spans"]):
-                block_text = s["text"].rstrip("\n")
+                block_text = s["text"]
+                # Remove trailing newlines and carriage returns (tesseract)
+                while len(block_text) > 0 and block_text[-1] in ["\n", "\r"]:
+                    block_text = block_text[:-1]
+
                 block_text = block_text.replace("-\n", "") # Remove hyphenated line breaks
                 span_obj = Span(
                     text=block_text, # Remove end of line newlines, not spaces
diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py
index a0fada3d..6d9e37f1 100644
--- a/marker/postprocessors/markdown.py
+++ b/marker/postprocessors/markdown.py
@@ -1,6 +1,7 @@
 from marker.schema.merged import MergedLine, MergedBlock, FullyMergedBlock
 from marker.schema.page import Page
 import re
+import regex
 from typing import List
 
 
@@ -80,31 +81,35 @@ def block_surround(text, block_type):
 
 def line_separator(line1, line2, block_type, is_continuation=False):
     # Should cover latin-derived languages and russian
-    lowercase_letters = "a-zà-öø-ÿа-яşćăâđêôơưþðæøå"
-    uppercase_letters = "A-ZÀ-ÖØ-ßА-ЯŞĆĂÂĐÊÔƠƯÞÐÆØÅ"
+    lowercase_letters = r'(\p{Lo}+|\p{Ll}+)'
     # Remove hyphen in current line if next line and current line appear to be joined
-    hyphen_pattern = re.compile(rf'.*[{lowercase_letters}][-]\s?$', re.DOTALL)
-    if line1 and hyphen_pattern.match(line1) and re.match(rf"^\s?[{lowercase_letters}]", line2):
+    hyphen_pattern = regex.compile(rf'.*[{lowercase_letters}][-]\s?$', regex.DOTALL)
+    if line1 and hyphen_pattern.match(line1) and regex.match(rf"^\s?[{lowercase_letters}]", line2):
         # Split on — or - from the right
         line1 = re.split(r"[-—]\s?$", line1)[0]
         return line1.rstrip() + line2.lstrip()
 
-    lowercase_pattern1 = re.compile(rf'.*[{lowercase_letters},]\s?$', re.DOTALL)
-    lowercase_pattern2 = re.compile(rf'^\s?[{uppercase_letters}{lowercase_letters}]', re.DOTALL)
-    end_pattern = re.compile(r'.*[.?!]\s?$', re.DOTALL)
+    all_letters = r'\p{L}+'
+    sentence_continuations = r',;(—'
+    sentence_ends = r'。ๆ.?!'
+    line_end_pattern = regex.compile(rf'.*[{lowercase_letters}{sentence_continuations}]\s?$', regex.DOTALL)
+    line_start_pattern = regex.compile(rf'^\s?[{all_letters}]', regex.DOTALL)
+    sentence_end_pattern = regex.compile(rf'.*[{sentence_ends}]\s?$', regex.DOTALL)
 
     if block_type in ["Title", "Section-header"]:
         return line1.rstrip() + " " + line2.lstrip()
-    elif lowercase_pattern1.match(line1) and lowercase_pattern2.match(line2) and block_type == "Text":
+    elif line_end_pattern.match(line1) and line_start_pattern.match(line2) and block_type == "Text":
         return line1.rstrip() + " " + line2.lstrip()
     elif is_continuation:
         return line1.rstrip() + " " + line2.lstrip()
-    elif block_type == "Text" and end_pattern.match(line1):
+    elif block_type == "Text" and sentence_end_pattern.match(line1):
         return line1 + "\n\n" + line2
     elif block_type == "Formula":
         return line1 + " " + line2
     elif block_type == "Table":
         return line1 + "\n\n" + line2
+    elif block_type in ["Formula"]:
+        return line1.rstrip() + "\n\n" + line2.lstrip()
     else:
         return line1 + "\n" + line2
 
diff --git a/marker/settings.py b/marker/settings.py
index 95652ae0..9bdc1490 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -61,11 +61,7 @@ def OCR_ENGINE_INTERNAL(self) -> str:
         if self.OCR_ENGINE is not None:
             return self.OCR_ENGINE
 
-        # Does not work with mps
-        if torch.cuda.is_available():
-            return "surya"
-
-        return "ocrmypdf"
+        return "surya"
 
     # Texify model
     TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
diff --git a/poetry.lock b/poetry.lock
index afb8b8f7..925699e8 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3554,104 +3554,90 @@ rpds-py = ">=0.7.0"
 
 [[package]]
 name = "regex"
-version = "2023.12.25"
+version = "2024.4.28"
 description = "Alternative regular expression module, to replace re."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0694219a1d54336fd0445ea382d49d36882415c0134ee1e8332afd1529f0baa5"},
-    {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b014333bd0217ad3d54c143de9d4b9a3ca1c5a29a6d0d554952ea071cff0f1f8"},
-    {file = "regex-2023.12.25-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d865984b3f71f6d0af64d0d88f5733521698f6c16f445bb09ce746c92c97c586"},
-    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e0eabac536b4cc7f57a5f3d095bfa557860ab912f25965e08fe1545e2ed8b4c"},
-    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c25a8ad70e716f96e13a637802813f65d8a6760ef48672aa3502f4c24ea8b400"},
-    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9b6d73353f777630626f403b0652055ebfe8ff142a44ec2cf18ae470395766e"},
-    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9cc99d6946d750eb75827cb53c4371b8b0fe89c733a94b1573c9dd16ea6c9e4"},
-    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88d1f7bef20c721359d8675f7d9f8e414ec5003d8f642fdfd8087777ff7f94b5"},
-    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cb3fe77aec8f1995611f966d0c656fdce398317f850d0e6e7aebdfe61f40e1cd"},
-    {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7aa47c2e9ea33a4a2a05f40fcd3ea36d73853a2aae7b4feab6fc85f8bf2c9704"},
-    {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:df26481f0c7a3f8739fecb3e81bc9da3fcfae34d6c094563b9d4670b047312e1"},
-    {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c40281f7d70baf6e0db0c2f7472b31609f5bc2748fe7275ea65a0b4601d9b392"},
-    {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:d94a1db462d5690ebf6ae86d11c5e420042b9898af5dcf278bd97d6bda065423"},
-    {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ba1b30765a55acf15dce3f364e4928b80858fa8f979ad41f862358939bdd1f2f"},
-    {file = "regex-2023.12.25-cp310-cp310-win32.whl", hash = "sha256:150c39f5b964e4d7dba46a7962a088fbc91f06e606f023ce57bb347a3b2d4630"},
-    {file = "regex-2023.12.25-cp310-cp310-win_amd64.whl", hash = "sha256:09da66917262d9481c719599116c7dc0c321ffcec4b1f510c4f8a066f8768105"},
-    {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:1b9d811f72210fa9306aeb88385b8f8bcef0dfbf3873410413c00aa94c56c2b6"},
-    {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d902a43085a308cef32c0d3aea962524b725403fd9373dea18110904003bac97"},
-    {file = "regex-2023.12.25-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d166eafc19f4718df38887b2bbe1467a4f74a9830e8605089ea7a30dd4da8887"},
-    {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7ad32824b7f02bb3c9f80306d405a1d9b7bb89362d68b3c5a9be53836caebdb"},
-    {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:636ba0a77de609d6510235b7f0e77ec494d2657108f777e8765efc060094c98c"},
-    {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fda75704357805eb953a3ee15a2b240694a9a514548cd49b3c5124b4e2ad01b"},
-    {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f72cbae7f6b01591f90814250e636065850c5926751af02bb48da94dfced7baa"},
-    {file = "regex-2023.12.25-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db2a0b1857f18b11e3b0e54ddfefc96af46b0896fb678c85f63fb8c37518b3e7"},
-    {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7502534e55c7c36c0978c91ba6f61703faf7ce733715ca48f499d3dbbd7657e0"},
-    {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e8c7e08bb566de4faaf11984af13f6bcf6a08f327b13631d41d62592681d24fe"},
-    {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:283fc8eed679758de38fe493b7d7d84a198b558942b03f017b1f94dda8efae80"},
-    {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:f44dd4d68697559d007462b0a3a1d9acd61d97072b71f6d1968daef26bc744bd"},
-    {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:67d3ccfc590e5e7197750fcb3a2915b416a53e2de847a728cfa60141054123d4"},
-    {file = "regex-2023.12.25-cp311-cp311-win32.whl", hash = "sha256:68191f80a9bad283432385961d9efe09d783bcd36ed35a60fb1ff3f1ec2efe87"},
-    {file = "regex-2023.12.25-cp311-cp311-win_amd64.whl", hash = "sha256:7d2af3f6b8419661a0c421584cfe8aaec1c0e435ce7e47ee2a97e344b98f794f"},
-    {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8a0ccf52bb37d1a700375a6b395bff5dd15c50acb745f7db30415bae3c2b0715"},
-    {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c3c4a78615b7762740531c27cf46e2f388d8d727d0c0c739e72048beb26c8a9d"},
-    {file = "regex-2023.12.25-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad83e7545b4ab69216cef4cc47e344d19622e28aabec61574b20257c65466d6a"},
-    {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7a635871143661feccce3979e1727c4e094f2bdfd3ec4b90dfd4f16f571a87a"},
-    {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d498eea3f581fbe1b34b59c697512a8baef88212f92e4c7830fcc1499f5b45a5"},
-    {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:43f7cd5754d02a56ae4ebb91b33461dc67be8e3e0153f593c509e21d219c5060"},
-    {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51f4b32f793812714fd5307222a7f77e739b9bc566dc94a18126aba3b92b98a3"},
-    {file = "regex-2023.12.25-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba99d8077424501b9616b43a2d208095746fb1284fc5ba490139651f971d39d9"},
-    {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4bfc2b16e3ba8850e0e262467275dd4d62f0d045e0e9eda2bc65078c0110a11f"},
-    {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8c2c19dae8a3eb0ea45a8448356ed561be843b13cbc34b840922ddf565498c1c"},
-    {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:60080bb3d8617d96f0fb7e19796384cc2467447ef1c491694850ebd3670bc457"},
-    {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b77e27b79448e34c2c51c09836033056a0547aa360c45eeeb67803da7b0eedaf"},
-    {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:518440c991f514331f4850a63560321f833979d145d7d81186dbe2f19e27ae3d"},
-    {file = "regex-2023.12.25-cp312-cp312-win32.whl", hash = "sha256:e2610e9406d3b0073636a3a2e80db05a02f0c3169b5632022b4e81c0364bcda5"},
-    {file = "regex-2023.12.25-cp312-cp312-win_amd64.whl", hash = "sha256:cc37b9aeebab425f11f27e5e9e6cf580be7206c6582a64467a14dda211abc232"},
-    {file = "regex-2023.12.25-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:da695d75ac97cb1cd725adac136d25ca687da4536154cdc2815f576e4da11c69"},
-    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d126361607b33c4eb7b36debc173bf25d7805847346dd4d99b5499e1fef52bc7"},
-    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4719bb05094d7d8563a450cf8738d2e1061420f79cfcc1fa7f0a44744c4d8f73"},
-    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5dd58946bce44b53b06d94aa95560d0b243eb2fe64227cba50017a8d8b3cd3e2"},
-    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22a86d9fff2009302c440b9d799ef2fe322416d2d58fc124b926aa89365ec482"},
-    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2aae8101919e8aa05ecfe6322b278f41ce2994c4a430303c4cd163fef746e04f"},
-    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e692296c4cc2873967771345a876bcfc1c547e8dd695c6b89342488b0ea55cd8"},
-    {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:263ef5cc10979837f243950637fffb06e8daed7f1ac1e39d5910fd29929e489a"},
-    {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:d6f7e255e5fa94642a0724e35406e6cb7001c09d476ab5fce002f652b36d0c39"},
-    {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:88ad44e220e22b63b0f8f81f007e8abbb92874d8ced66f32571ef8beb0643b2b"},
-    {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:3a17d3ede18f9cedcbe23d2daa8a2cd6f59fe2bf082c567e43083bba3fb00347"},
-    {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d15b274f9e15b1a0b7a45d2ac86d1f634d983ca40d6b886721626c47a400bf39"},
-    {file = "regex-2023.12.25-cp37-cp37m-win32.whl", hash = "sha256:ed19b3a05ae0c97dd8f75a5d8f21f7723a8c33bbc555da6bbe1f96c470139d3c"},
-    {file = "regex-2023.12.25-cp37-cp37m-win_amd64.whl", hash = "sha256:a6d1047952c0b8104a1d371f88f4ab62e6275567d4458c1e26e9627ad489b445"},
-    {file = "regex-2023.12.25-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b43523d7bc2abd757119dbfb38af91b5735eea45537ec6ec3a5ec3f9562a1c53"},
-    {file = "regex-2023.12.25-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:efb2d82f33b2212898f1659fb1c2e9ac30493ac41e4d53123da374c3b5541e64"},
-    {file = "regex-2023.12.25-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b7fca9205b59c1a3d5031f7e64ed627a1074730a51c2a80e97653e3e9fa0d415"},
-    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086dd15e9435b393ae06f96ab69ab2d333f5d65cbe65ca5a3ef0ec9564dfe770"},
-    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e81469f7d01efed9b53740aedd26085f20d49da65f9c1f41e822a33992cb1590"},
-    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:34e4af5b27232f68042aa40a91c3b9bb4da0eeb31b7632e0091afc4310afe6cb"},
-    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9852b76ab558e45b20bf1893b59af64a28bd3820b0c2efc80e0a70a4a3ea51c1"},
-    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff100b203092af77d1a5a7abe085b3506b7eaaf9abf65b73b7d6905b6cb76988"},
-    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cc038b2d8b1470364b1888a98fd22d616fba2b6309c5b5f181ad4483e0017861"},
-    {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:094ba386bb5c01e54e14434d4caabf6583334090865b23ef58e0424a6286d3dc"},
-    {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5cd05d0f57846d8ba4b71d9c00f6f37d6b97d5e5ef8b3c3840426a475c8f70f4"},
-    {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:9aa1a67bbf0f957bbe096375887b2505f5d8ae16bf04488e8b0f334c36e31360"},
-    {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:98a2636994f943b871786c9e82bfe7883ecdaba2ef5df54e1450fa9869d1f756"},
-    {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:37f8e93a81fc5e5bd8db7e10e62dc64261bcd88f8d7e6640aaebe9bc180d9ce2"},
-    {file = "regex-2023.12.25-cp38-cp38-win32.whl", hash = "sha256:d78bd484930c1da2b9679290a41cdb25cc127d783768a0369d6b449e72f88beb"},
-    {file = "regex-2023.12.25-cp38-cp38-win_amd64.whl", hash = "sha256:b521dcecebc5b978b447f0f69b5b7f3840eac454862270406a39837ffae4e697"},
-    {file = "regex-2023.12.25-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f7bc09bc9c29ebead055bcba136a67378f03d66bf359e87d0f7c759d6d4ffa31"},
-    {file = "regex-2023.12.25-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e14b73607d6231f3cc4622809c196b540a6a44e903bcfad940779c80dffa7be7"},
-    {file = "regex-2023.12.25-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9eda5f7a50141291beda3edd00abc2d4a5b16c29c92daf8d5bd76934150f3edc"},
-    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc6bb9aa69aacf0f6032c307da718f61a40cf970849e471254e0e91c56ffca95"},
-    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:298dc6354d414bc921581be85695d18912bea163a8b23cac9a2562bbcd5088b1"},
-    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f4e475a80ecbd15896a976aa0b386c5525d0ed34d5c600b6d3ebac0a67c7ddf"},
-    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:531ac6cf22b53e0696f8e1d56ce2396311254eb806111ddd3922c9d937151dae"},
-    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22f3470f7524b6da61e2020672df2f3063676aff444db1daa283c2ea4ed259d6"},
-    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:89723d2112697feaa320c9d351e5f5e7b841e83f8b143dba8e2d2b5f04e10923"},
-    {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0ecf44ddf9171cd7566ef1768047f6e66975788258b1c6c6ca78098b95cf9a3d"},
-    {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:905466ad1702ed4acfd67a902af50b8db1feeb9781436372261808df7a2a7bca"},
-    {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:4558410b7a5607a645e9804a3e9dd509af12fb72b9825b13791a37cd417d73a5"},
-    {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:7e316026cc1095f2a3e8cc012822c99f413b702eaa2ca5408a513609488cb62f"},
-    {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3b1de218d5375cd6ac4b5493e0b9f3df2be331e86520f23382f216c137913d20"},
-    {file = "regex-2023.12.25-cp39-cp39-win32.whl", hash = "sha256:11a963f8e25ab5c61348d090bf1b07f1953929c13bd2309a0662e9ff680763c9"},
-    {file = "regex-2023.12.25-cp39-cp39-win_amd64.whl", hash = "sha256:e693e233ac92ba83a87024e1d32b5f9ab15ca55ddd916d878146f4e3406b5c91"},
-    {file = "regex-2023.12.25.tar.gz", hash = "sha256:29171aa128da69afdf4bde412d5bedc335f2ca8fcfe4489038577d05f16181e5"},
+    {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"},
+    {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"},
+    {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"},
+    {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"},
+    {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"},
+    {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"},
+    {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"},
+    {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"},
+    {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"},
+    {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"},
+    {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"},
+    {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"},
+    {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"},
+    {file = "regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"},
+    {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"},
+    {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"},
+    {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"},
+    {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"},
+    {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"},
+    {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"},
+    {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"},
+    {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"},
+    {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"},
+    {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"},
+    {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"},
+    {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"},
 ]
 
 [[package]]
@@ -5004,4 +4990,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13,!=3.9.7"
-content-hash = "28d4e2918b3d71048f67e0569e7dfa5b729cc553c33e97bf371e3b8c04803fd8"
+content-hash = "459483572dd8347587db50c0e627b839b6b061af2af022ab8d893c70905b04cb"
diff --git a/pyproject.toml b/pyproject.toml
index be9ecd3e..ff0c2ff4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ rapidfuzz = "^3.8.1"
 surya-ocr = "^0.4.0"
 filetype = "^1.2.0"
 pdftext = "^0.3.4"
+regex = "^2024.4.28"
 
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"