From df6f8fcf8f7b7d938ad0300155ef5752bf4e1612 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 3 May 2024 14:14:05 -0700 Subject: [PATCH 1/3] Improve table recognition and equation insertion --- marker/cleaners/fontstyle.py | 4 ++ marker/cleaners/table.py | 106 ++++++++++++++++++++++++++------ marker/equations/equations.py | 46 +++++++++++--- marker/ocr/heuristics.py | 2 +- marker/ocr/recognition.py | 4 +- marker/pdf/extract_text.py | 9 ++- marker/postprocessors/images.py | 0 marker/schema/block.py | 12 ++++ 8 files changed, 149 insertions(+), 34 deletions(-) create mode 100644 marker/postprocessors/images.py diff --git a/marker/cleaners/fontstyle.py b/marker/cleaners/fontstyle.py index a92d8bd6..2f4a6185 100644 --- a/marker/cleaners/fontstyle.py +++ b/marker/cleaners/fontstyle.py @@ -20,6 +20,10 @@ def find_bold_italic(pages: List[Page], bold_min_weight=550): span.italic = True font_weights.append(span.font_weight) + + if len(font_weights) == 0: + return + font_weights = np.array(font_weights) bold_thresh = np.percentile(font_weights, 90) bold_thresh_lower = np.percentile(font_weights, 75) diff --git a/marker/cleaners/table.py b/marker/cleaners/table.py index bfb0e200..2a5f36ac 100644 --- a/marker/cleaners/table.py +++ b/marker/cleaners/table.py @@ -15,10 +15,17 @@ def replace_dots(text): return text +def replace_newlines(text): + # Replace all newlines + newline_pattern = re.compile(r'[\r\n]+') + return newline_pattern.sub(' ', text.strip()) + + def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]: table_rows = [] - row_y_coord = None table_row = [] + x_position = None + y_position = None for block_idx, block in enumerate(page.blocks): for line_idx, line in enumerate(block.lines): line_bbox = line.bbox @@ -26,30 +33,81 @@ def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]: if intersect_pct < .5 or len(line.spans) == 0: continue normed_y_start = line_bbox[1] / page.height - if row_y_coord is None or abs(normed_y_start - row_y_coord) < y_tol: - table_row.extend([s.text for s in line.spans]) + normed_x_start = line_bbox[0] / page.width + normed_x_end = line_bbox[2] / page.width + + cells = [[s.bbox, s.text] for s in line.spans] + if x_position is None or (normed_x_start > x_position and abs(normed_y_start - y_position) < y_tol): + # Same row + table_row.extend(cells) else: - table_rows.append(table_row) - table_row = [s.text for s in line.spans] - row_y_coord = normed_y_start + # New row + if len(table_row) > 0: + table_rows.append(table_row) + table_row = cells + y_position = normed_y_start + x_position = normed_x_end if len(table_row) > 0: table_rows.append(table_row) + table_rows = assign_cells_to_columns(table_rows) return table_rows -def get_table_pdftext(page: Page, table_box) -> List[List[str]]: +def assign_cells_to_columns(rows, round_factor=4, tolerance=4): + left_edges = [] + right_edges = [] + centers = [] + + for row in rows: + for cell in row: + left_edges.append(cell[0][0] / round_factor * round_factor) + right_edges.append(cell[0][2] / round_factor * round_factor) + centers.append((cell[0][0] + cell[0][2]) / 2 * round_factor / round_factor) + + unique_left = sorted(list(set(left_edges))) + unique_right = sorted(list(set(right_edges))) + unique_center = sorted(list(set(centers))) + + # Find list with minimum length + separators = min([unique_left, unique_right, unique_center], key=len) + + new_rows = [] + for row in rows: + new_row = {} + last_col_index = -1 + for cell in row: + left_edge = cell[0][0] + column_index = -1 + for i, separator in enumerate(separators): + if left_edge - tolerance < separator and last_col_index < i: + column_index = i + break + if column_index == -1: + column_index = cell[0][0] # Assign a new column + new_row[column_index] = cell[1] + last_col_index = column_index + + flat_row = [cell[1] for cell in sorted(new_row.items())] + min_column_index = min(new_row.keys()) + flat_row = [""] * min_column_index + flat_row + new_rows.append(flat_row) + + return new_rows + + +def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: page_width = page.width table_rows = [] + table_cell = "" + cell_bbox = None + prev_end = None + table_row = [] for block_idx, block in enumerate(page.char_blocks): for line_idx, line in enumerate(block["lines"]): line_bbox = line["bbox"] intersect_pct = box_intersection_pct(line_bbox, table_box) if intersect_pct < .5: continue - prev_end = None - table_row = [] - table_cell = "" - cell_bbox = None for span in line["spans"]: for char in span["chars"]: x_start, y_start, x_end, y_end = char["bbox"] @@ -60,18 +118,28 @@ def get_table_pdftext(page: Page, table_box) -> List[List[str]]: x_start /= page_width x_end /= page_width - if prev_end is None or x_start - prev_end < .01: + cell_content = replace_dots(replace_newlines(table_cell)) + if prev_end is None or abs(x_start - prev_end) < space_tol: # Check if we are in the same cell table_cell += char["char"] - else: - table_row.append(replace_dots(table_cell.strip())) + elif x_start > prev_end - space_tol: # Check if we are on the same line + if len(table_cell) > 0: + table_row.append((cell_bbox, cell_content)) table_cell = char["char"] cell_bbox = char["bbox"] + else: # New line and cell + if len(table_cell) > 0: + table_row.append((cell_bbox, cell_content)) + table_cell = char["char"] + cell_bbox = char["bbox"] + if len(table_row) > 0: + table_rows.append(table_row) + table_row = [] prev_end = x_end - if len(table_cell) > 0: - table_row.append(replace_dots(table_cell.strip())) - table_cell = "" - if len(table_row) > 0: - table_rows.append(table_row) + if len(table_cell) > 0: + table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell)))) + if len(table_row) > 0: + table_rows.append(table_row) + table_rows = assign_cells_to_columns(table_rows) return table_rows diff --git a/marker/equations/equations.py b/marker/equations/equations.py index d0246f60..da23b136 100644 --- a/marker/equations/equations.py +++ b/marker/equations/equations.py @@ -7,7 +7,7 @@ from marker.equations.inference import get_total_texify_tokens, get_latex_batched from marker.schema.bbox import rescale_bbox from marker.schema.page import Page -from marker.schema.block import Line, Span, Block, bbox_from_lines +from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines from marker.settings import settings @@ -28,11 +28,7 @@ def find_equation_blocks(page, processor): equation_lines[region_idx].append(line) if region_idx not in insert_points: - # Insert before the block if line is at the beginning of the block, otherwise after the block - if line_idx <= len(block.lines) // 2: - insert_points[region_idx] = block_idx - else: - insert_points[region_idx] = block_idx + 1 + insert_points[region_idx] = (block_idx, line_idx) block_lines_to_remove = defaultdict(set) for region_idx, equation_region in enumerate(equation_regions): @@ -44,8 +40,13 @@ def find_equation_blocks(page, processor): equation_bbox = bbox_from_lines(equation_block) total_tokens = get_total_texify_tokens(block_text, processor) - selected_blocks = (equation_insert, total_tokens, block_text, equation_bbox) + equation_insert_line_idx = equation_insert[1] + equation_insert_line_idx -= len( + [x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]]) + + selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_bbox] if total_tokens < settings.TEXIFY_MODEL_MAX: + # Account for the lines we're about to remove for item in lines_to_remove[region_idx]: block_lines_to_remove[item[0]].add(item[1]) equation_blocks.append(selected_blocks) @@ -58,12 +59,19 @@ def find_equation_blocks(page, processor): return equation_blocks +def increment_insert_points(page_equation_blocks, insert_block_idx, insert_count): + for idx, (block_idx, line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks): + if block_idx >= insert_block_idx: + page_equation_blocks[idx][0] += insert_count + + def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnum, processor): converted_spans = [] idx = 0 success_count = 0 fail_count = 0 - for block_number, (insert_point, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks): + total_inserted = 0 + for block_number, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks): latex_text = predictions[block_number] conditions = [ get_total_texify_tokens(latex_text, processor) < settings.TEXIFY_MODEL_MAX, # Make sure we didn't get to the overall token max, indicates run-on @@ -97,7 +105,25 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu new_block.lines[0].spans[0].text = latex_text converted_spans.append(deepcopy(new_block.lines[0].spans[0])) - page_blocks.blocks.insert(insert_point, new_block) + # Add in the new LaTeX block + if insert_line_idx == 0: + page_blocks.blocks.insert(insert_block_idx, new_block) + increment_insert_points(page_equation_blocks, insert_block_idx, 1) + elif insert_line_idx >= len(page_blocks.blocks[insert_block_idx].lines): + page_blocks.blocks.insert(insert_block_idx + 1, new_block) + increment_insert_points(page_equation_blocks, insert_block_idx + 1, 1) + else: + new_blocks = [] + for block_idx, block in enumerate(page_blocks.blocks): + if block_idx == insert_block_idx: + split_block = split_block_lines(block, insert_line_idx) + new_blocks.append(split_block[0]) + new_blocks.append(new_block) + new_blocks.append(split_block[1]) + increment_insert_points(page_equation_blocks, insert_block_idx, 2) + else: + new_blocks.append(block) + page_blocks.blocks = new_blocks return success_count, fail_count, converted_spans @@ -117,7 +143,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings. token_counts = [] for page_idx, page_equation_blocks in enumerate(equation_blocks): page_obj = doc[page_idx] - for equation_idx, (insert_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks): + for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks): png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox) images.append(png_image) diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py index ffe6e422..b83d5566 100644 --- a/marker/ocr/heuristics.py +++ b/marker/ocr/heuristics.py @@ -63,7 +63,7 @@ def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6) total_intersection = 0 for block in page.blocks: for line in block.lines: - intersection_pct = box_intersection_pct(detected_bbox, line.bbox) + intersection_pct = box_intersection_pct(line.bbox, detected_bbox) total_intersection += intersection_pct if total_intersection > intersect_thresh: found_lines += 1 diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py index d62624b8..535f6507 100644 --- a/marker/ocr/recognition.py +++ b/marker/ocr/recognition.py @@ -120,8 +120,8 @@ def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]: out_pdf, language=langs[0], output_type="pdf", - redo_ocr=None if settings.OCR_ALL_PAGES else True, - force_ocr=True if settings.OCR_ALL_PAGES else None, + redo_ocr=None, + force_ocr=True, progress_bar=False, optimize=False, fast_web_view=1e6, diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py index 1512643c..56871d6a 100644 --- a/marker/pdf/extract_text.py +++ b/marker/pdf/extract_text.py @@ -25,7 +25,7 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page: block_text = s["text"].rstrip("\n") block_text = block_text.replace("-\n", "") # Remove hyphenated line breaks span_obj = Span( - text=block_text.rstrip("\n"), # Remove end of line newlines, not spaces + text=block_text, # Remove end of line newlines, not spaces bbox=s["bbox"], span_id=f"{pnum}_{span_id}", font=f"{s['font']['name']}_{font_flags_decomposer(s['font']['flags'])}", # Add font flags to end of font @@ -49,10 +49,15 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page: # Only select blocks with lines if len(block_lines) > 0: page_blocks.append(block_obj) + + page_bbox = page["bbox"] + page_width = abs(page_bbox[2] - page_bbox[0]) + page_height = abs(page_bbox[3] - page_bbox[1]) + page_bbox = [0, 0, page_width, page_height] out_page = Page( blocks=page_blocks, pnum=page["page"], - bbox=page["bbox"], + bbox=page_bbox, rotation=page["rotation"], char_blocks=page["blocks"] ) diff --git a/marker/postprocessors/images.py b/marker/postprocessors/images.py new file mode 100644 index 00000000..e69de29b diff --git a/marker/schema/block.py b/marker/schema/block.py index df4f90e8..1220b698 100644 --- a/marker/schema/block.py +++ b/marker/schema/block.py @@ -86,3 +86,15 @@ def bbox_from_lines(lines: List[Line]): max_x = max([line.bbox[2] for line in lines]) max_y = max([line.bbox[3] for line in lines]) return [min_x, min_y, max_x, max_y] + + +def split_block_lines(block: Block, split_line_idx: int): + new_blocks = [] + if split_line_idx >= len(block.lines): + return [block] + elif split_line_idx == 0: + return [block] + else: + new_blocks.append(Block(lines=block.lines[:split_line_idx], bbox=bbox_from_lines(block.lines[:split_line_idx]), pnum=block.pnum)) + new_blocks.append(Block(lines=block.lines[split_line_idx:], bbox=bbox_from_lines(block.lines[split_line_idx:]), pnum=block.pnum)) + return new_blocks From c22d32e75f2398cd7977262dcbd98759fed8dca7 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 3 May 2024 14:33:49 -0700 Subject: [PATCH 2/3] Sort character blocks for pdf text --- marker/cleaners/table.py | 30 +++++++++++++++++++++++------- marker/cleaners/text.py | 8 ++++++++ marker/convert.py | 6 ++++-- marker/postprocessors/markdown.py | 2 ++ 4 files changed, 37 insertions(+), 9 deletions(-) create mode 100644 marker/cleaners/text.py diff --git a/marker/cleaners/table.py b/marker/cleaners/table.py index 2a5f36ac..84e0c674 100644 --- a/marker/cleaners/table.py +++ b/marker/cleaners/table.py @@ -6,6 +6,23 @@ import re +def sort_char_blocks(blocks, tolerance=1.25): + vertical_groups = {} + for block in blocks: + group_key = round(block["bbox"][1] / tolerance) * tolerance + if group_key not in vertical_groups: + vertical_groups[group_key] = [] + vertical_groups[group_key].append(block) + + # Sort each group horizontally and flatten the groups into a single list + sorted_blocks = [] + for _, group in sorted(vertical_groups.items()): + sorted_group = sorted(group, key=lambda x: x["bbox"][0]) + sorted_blocks.extend(sorted_group) + + return sorted_blocks + + def replace_dots(text): dot_pattern = re.compile(r'(\s*\.\s*){4,}') dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL) @@ -21,23 +38,21 @@ def replace_newlines(text): return newline_pattern.sub(' ', text.strip()) -def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]: +def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]: table_rows = [] table_row = [] x_position = None - y_position = None for block_idx, block in enumerate(page.blocks): for line_idx, line in enumerate(block.lines): line_bbox = line.bbox intersect_pct = box_intersection_pct(line_bbox, table_box) if intersect_pct < .5 or len(line.spans) == 0: continue - normed_y_start = line_bbox[1] / page.height normed_x_start = line_bbox[0] / page.width normed_x_end = line_bbox[2] / page.width cells = [[s.bbox, s.text] for s in line.spans] - if x_position is None or (normed_x_start > x_position and abs(normed_y_start - y_position) < y_tol): + if x_position is None or normed_x_start > x_position - space_tol: # Same row table_row.extend(cells) else: @@ -45,7 +60,6 @@ def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]: if len(table_row) > 0: table_rows.append(table_row) table_row = cells - y_position = normed_y_start x_position = normed_x_end if len(table_row) > 0: table_rows.append(table_row) @@ -102,8 +116,10 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: cell_bbox = None prev_end = None table_row = [] - for block_idx, block in enumerate(page.char_blocks): - for line_idx, line in enumerate(block["lines"]): + sorted_char_blocks = sort_char_blocks(page.char_blocks) + for block_idx, block in enumerate(sorted_char_blocks): + sorted_block_lines = sort_char_blocks(block["lines"]) + for line_idx, line in enumerate(sorted_block_lines): line_bbox = line["bbox"] intersect_pct = box_intersection_pct(line_bbox, table_box) if intersect_pct < .5: diff --git a/marker/cleaners/text.py b/marker/cleaners/text.py new file mode 100644 index 00000000..56870ec5 --- /dev/null +++ b/marker/cleaners/text.py @@ -0,0 +1,8 @@ +import re + + +def cleanup_text(full_text): + full_text = re.sub(r'\n{3,}', '\n\n', full_text) + full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text) + full_text = full_text.replace('\xa0', ' ') # Replace non-breaking spaces + return full_text \ No newline at end of file diff --git a/marker/convert.py b/marker/convert.py index f5616b24..f3965be6 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -1,4 +1,7 @@ import warnings + +from marker.cleaners.text import cleanup_text + warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings import pypdfium2 as pdfium @@ -131,8 +134,7 @@ def convert_single_pdf( full_text = get_full_text(text_blocks) # Handle empty blocks being joined - full_text = re.sub(r'\n{3,}', '\n\n', full_text) - full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text) + full_text = cleanup_text(full_text) # Replace bullet characters with a - full_text = replace_bullets(full_text) diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py index da9a914d..a0fada3d 100644 --- a/marker/postprocessors/markdown.py +++ b/marker/postprocessors/markdown.py @@ -103,6 +103,8 @@ def line_separator(line1, line2, block_type, is_continuation=False): return line1 + "\n\n" + line2 elif block_type == "Formula": return line1 + " " + line2 + elif block_type == "Table": + return line1 + "\n\n" + line2 else: return line1 + "\n" + line2 From 9086dd5153a2d21401e5be93dc70fd4d086c976c Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 3 May 2024 16:30:30 -0700 Subject: [PATCH 3/3] Improve table, markdown, and ocr --- README.md | 2 +- marker/cleaners/table.py | 20 ++-- marker/ocr/heuristics.py | 2 +- marker/ocr/recognition.py | 2 +- marker/pdf/extract_text.py | 6 +- marker/postprocessors/markdown.py | 23 ++-- marker/settings.py | 6 +- poetry.lock | 178 ++++++++++++++---------------- pyproject.toml | 1 + 9 files changed, 119 insertions(+), 121 deletions(-) diff --git a/README.md b/README.md index 396433de..f31beba6 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ First, some configuration. Note that settings can be overridden with env vars, - Your torch device will be automatically detected, but you can manually set it also. For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default. - If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU). For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`. - Depending on your document types, marker's average memory usage per task can vary slightly. You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors. -- By default, marker will use `ocrmypdf` for OCR on CPU, and `surya` on GPU. Surya is slower on CPU, but more accurate. `ocrmypdf` also requires external dependencies (see above). You can override the default with the `OCR_ENGINE` setting. +- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). - Inspect the other settings in `marker/settings.py`. You can override any settings in the `local.env` file, or by setting environment variables. diff --git a/marker/cleaners/table.py b/marker/cleaners/table.py index 84e0c674..fe33e0e0 100644 --- a/marker/cleaners/table.py +++ b/marker/cleaners/table.py @@ -6,10 +6,14 @@ import re -def sort_char_blocks(blocks, tolerance=1.25): +def sort_table_blocks(blocks, tolerance=5): vertical_groups = {} for block in blocks: - group_key = round(block["bbox"][1] / tolerance) * tolerance + if hasattr(block, "bbox"): + bbox = block.bbox + else: + bbox = block["bbox"] + group_key = round(bbox[1] / tolerance) * tolerance if group_key not in vertical_groups: vertical_groups[group_key] = [] vertical_groups[group_key].append(block) @@ -17,7 +21,7 @@ def sort_char_blocks(blocks, tolerance=1.25): # Sort each group horizontally and flatten the groups into a single list sorted_blocks = [] for _, group in sorted(vertical_groups.items()): - sorted_group = sorted(group, key=lambda x: x["bbox"][0]) + sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0]) sorted_blocks.extend(sorted_group) return sorted_blocks @@ -42,8 +46,10 @@ def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]: table_rows = [] table_row = [] x_position = None - for block_idx, block in enumerate(page.blocks): - for line_idx, line in enumerate(block.lines): + sorted_blocks = sort_table_blocks(page.blocks) + for block_idx, block in enumerate(sorted_blocks): + sorted_lines = sort_table_blocks(block.lines) + for line_idx, line in enumerate(sorted_lines): line_bbox = line.bbox intersect_pct = box_intersection_pct(line_bbox, table_box) if intersect_pct < .5 or len(line.spans) == 0: @@ -116,9 +122,9 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: cell_bbox = None prev_end = None table_row = [] - sorted_char_blocks = sort_char_blocks(page.char_blocks) + sorted_char_blocks = sort_table_blocks(page.char_blocks) for block_idx, block in enumerate(sorted_char_blocks): - sorted_block_lines = sort_char_blocks(block["lines"]) + sorted_block_lines = sort_table_blocks(block["lines"]) for line_idx, line in enumerate(sorted_block_lines): line_bbox = line["bbox"] intersect_pct = box_intersection_pct(line_bbox, table_box) diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py index b83d5566..2fdb9d8e 100644 --- a/marker/ocr/heuristics.py +++ b/marker/ocr/heuristics.py @@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]): return len(full_text.strip()) == 0 -def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6): +def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3): found_lines = 0 for detected_line in page.text_lines.bboxes: diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py index 535f6507..6da62d8d 100644 --- a/marker/ocr/recognition.py +++ b/marker/ocr/recognition.py @@ -41,7 +41,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor ocr_success += 1 pages[orig_idx] = page - return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success} + return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success, "ocr_engine": ocr_method} def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page]) -> List[Optional[Page]]: diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py index 56871d6a..bf10e906 100644 --- a/marker/pdf/extract_text.py +++ b/marker/pdf/extract_text.py @@ -22,7 +22,11 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page: for l in block["lines"]: spans = [] for i, s in enumerate(l["spans"]): - block_text = s["text"].rstrip("\n") + block_text = s["text"] + # Remove trailing newlines and carriage returns (tesseract) + while len(block_text) > 0 and block_text[-1] in ["\n", "\r"]: + block_text = block_text[:-1] + block_text = block_text.replace("-\n", "") # Remove hyphenated line breaks span_obj = Span( text=block_text, # Remove end of line newlines, not spaces diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py index a0fada3d..6d9e37f1 100644 --- a/marker/postprocessors/markdown.py +++ b/marker/postprocessors/markdown.py @@ -1,6 +1,7 @@ from marker.schema.merged import MergedLine, MergedBlock, FullyMergedBlock from marker.schema.page import Page import re +import regex from typing import List @@ -80,31 +81,35 @@ def block_surround(text, block_type): def line_separator(line1, line2, block_type, is_continuation=False): # Should cover latin-derived languages and russian - lowercase_letters = "a-zà-öø-ÿа-яşćăâđêôơưþðæøå" - uppercase_letters = "A-ZÀ-ÖØ-ßА-ЯŞĆĂÂĐÊÔƠƯÞÐÆØÅ" + lowercase_letters = r'(\p{Lo}+|\p{Ll}+)' # Remove hyphen in current line if next line and current line appear to be joined - hyphen_pattern = re.compile(rf'.*[{lowercase_letters}][-]\s?$', re.DOTALL) - if line1 and hyphen_pattern.match(line1) and re.match(rf"^\s?[{lowercase_letters}]", line2): + hyphen_pattern = regex.compile(rf'.*[{lowercase_letters}][-]\s?$', regex.DOTALL) + if line1 and hyphen_pattern.match(line1) and regex.match(rf"^\s?[{lowercase_letters}]", line2): # Split on — or - from the right line1 = re.split(r"[-—]\s?$", line1)[0] return line1.rstrip() + line2.lstrip() - lowercase_pattern1 = re.compile(rf'.*[{lowercase_letters},]\s?$', re.DOTALL) - lowercase_pattern2 = re.compile(rf'^\s?[{uppercase_letters}{lowercase_letters}]', re.DOTALL) - end_pattern = re.compile(r'.*[.?!]\s?$', re.DOTALL) + all_letters = r'\p{L}+' + sentence_continuations = r',;(—' + sentence_ends = r'。ๆ.?!' + line_end_pattern = regex.compile(rf'.*[{lowercase_letters}{sentence_continuations}]\s?$', regex.DOTALL) + line_start_pattern = regex.compile(rf'^\s?[{all_letters}]', regex.DOTALL) + sentence_end_pattern = regex.compile(rf'.*[{sentence_ends}]\s?$', regex.DOTALL) if block_type in ["Title", "Section-header"]: return line1.rstrip() + " " + line2.lstrip() - elif lowercase_pattern1.match(line1) and lowercase_pattern2.match(line2) and block_type == "Text": + elif line_end_pattern.match(line1) and line_start_pattern.match(line2) and block_type == "Text": return line1.rstrip() + " " + line2.lstrip() elif is_continuation: return line1.rstrip() + " " + line2.lstrip() - elif block_type == "Text" and end_pattern.match(line1): + elif block_type == "Text" and sentence_end_pattern.match(line1): return line1 + "\n\n" + line2 elif block_type == "Formula": return line1 + " " + line2 elif block_type == "Table": return line1 + "\n\n" + line2 + elif block_type in ["Formula"]: + return line1.rstrip() + "\n\n" + line2.lstrip() else: return line1 + "\n" + line2 diff --git a/marker/settings.py b/marker/settings.py index 95652ae0..9bdc1490 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -61,11 +61,7 @@ def OCR_ENGINE_INTERNAL(self) -> str: if self.OCR_ENGINE is not None: return self.OCR_ENGINE - # Does not work with mps - if torch.cuda.is_available(): - return "surya" - - return "ocrmypdf" + return "surya" # Texify model TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify diff --git a/poetry.lock b/poetry.lock index afb8b8f7..925699e8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3554,104 +3554,90 @@ rpds-py = ">=0.7.0" [[package]] name = "regex" -version = "2023.12.25" +version = "2024.4.28" description = "Alternative regular expression module, to replace re." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0694219a1d54336fd0445ea382d49d36882415c0134ee1e8332afd1529f0baa5"}, - {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b014333bd0217ad3d54c143de9d4b9a3ca1c5a29a6d0d554952ea071cff0f1f8"}, - {file = "regex-2023.12.25-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d865984b3f71f6d0af64d0d88f5733521698f6c16f445bb09ce746c92c97c586"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e0eabac536b4cc7f57a5f3d095bfa557860ab912f25965e08fe1545e2ed8b4c"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c25a8ad70e716f96e13a637802813f65d8a6760ef48672aa3502f4c24ea8b400"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9b6d73353f777630626f403b0652055ebfe8ff142a44ec2cf18ae470395766e"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9cc99d6946d750eb75827cb53c4371b8b0fe89c733a94b1573c9dd16ea6c9e4"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88d1f7bef20c721359d8675f7d9f8e414ec5003d8f642fdfd8087777ff7f94b5"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cb3fe77aec8f1995611f966d0c656fdce398317f850d0e6e7aebdfe61f40e1cd"}, - {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7aa47c2e9ea33a4a2a05f40fcd3ea36d73853a2aae7b4feab6fc85f8bf2c9704"}, - {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:df26481f0c7a3f8739fecb3e81bc9da3fcfae34d6c094563b9d4670b047312e1"}, - {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c40281f7d70baf6e0db0c2f7472b31609f5bc2748fe7275ea65a0b4601d9b392"}, - {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:d94a1db462d5690ebf6ae86d11c5e420042b9898af5dcf278bd97d6bda065423"}, - {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ba1b30765a55acf15dce3f364e4928b80858fa8f979ad41f862358939bdd1f2f"}, - {file = "regex-2023.12.25-cp310-cp310-win32.whl", hash = "sha256:150c39f5b964e4d7dba46a7962a088fbc91f06e606f023ce57bb347a3b2d4630"}, - {file = "regex-2023.12.25-cp310-cp310-win_amd64.whl", hash = "sha256:09da66917262d9481c719599116c7dc0c321ffcec4b1f510c4f8a066f8768105"}, - {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:1b9d811f72210fa9306aeb88385b8f8bcef0dfbf3873410413c00aa94c56c2b6"}, - {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d902a43085a308cef32c0d3aea962524b725403fd9373dea18110904003bac97"}, - {file = "regex-2023.12.25-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d166eafc19f4718df38887b2bbe1467a4f74a9830e8605089ea7a30dd4da8887"}, - {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7ad32824b7f02bb3c9f80306d405a1d9b7bb89362d68b3c5a9be53836caebdb"}, - {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:636ba0a77de609d6510235b7f0e77ec494d2657108f777e8765efc060094c98c"}, - {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fda75704357805eb953a3ee15a2b240694a9a514548cd49b3c5124b4e2ad01b"}, - {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f72cbae7f6b01591f90814250e636065850c5926751af02bb48da94dfced7baa"}, - {file = "regex-2023.12.25-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db2a0b1857f18b11e3b0e54ddfefc96af46b0896fb678c85f63fb8c37518b3e7"}, - {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7502534e55c7c36c0978c91ba6f61703faf7ce733715ca48f499d3dbbd7657e0"}, - {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e8c7e08bb566de4faaf11984af13f6bcf6a08f327b13631d41d62592681d24fe"}, - {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:283fc8eed679758de38fe493b7d7d84a198b558942b03f017b1f94dda8efae80"}, - {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:f44dd4d68697559d007462b0a3a1d9acd61d97072b71f6d1968daef26bc744bd"}, - {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:67d3ccfc590e5e7197750fcb3a2915b416a53e2de847a728cfa60141054123d4"}, - {file = "regex-2023.12.25-cp311-cp311-win32.whl", hash = "sha256:68191f80a9bad283432385961d9efe09d783bcd36ed35a60fb1ff3f1ec2efe87"}, - {file = "regex-2023.12.25-cp311-cp311-win_amd64.whl", hash = "sha256:7d2af3f6b8419661a0c421584cfe8aaec1c0e435ce7e47ee2a97e344b98f794f"}, - {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8a0ccf52bb37d1a700375a6b395bff5dd15c50acb745f7db30415bae3c2b0715"}, - {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c3c4a78615b7762740531c27cf46e2f388d8d727d0c0c739e72048beb26c8a9d"}, - {file = "regex-2023.12.25-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad83e7545b4ab69216cef4cc47e344d19622e28aabec61574b20257c65466d6a"}, - {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7a635871143661feccce3979e1727c4e094f2bdfd3ec4b90dfd4f16f571a87a"}, - {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d498eea3f581fbe1b34b59c697512a8baef88212f92e4c7830fcc1499f5b45a5"}, - {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:43f7cd5754d02a56ae4ebb91b33461dc67be8e3e0153f593c509e21d219c5060"}, - {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51f4b32f793812714fd5307222a7f77e739b9bc566dc94a18126aba3b92b98a3"}, - {file = "regex-2023.12.25-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba99d8077424501b9616b43a2d208095746fb1284fc5ba490139651f971d39d9"}, - {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4bfc2b16e3ba8850e0e262467275dd4d62f0d045e0e9eda2bc65078c0110a11f"}, - {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8c2c19dae8a3eb0ea45a8448356ed561be843b13cbc34b840922ddf565498c1c"}, - {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:60080bb3d8617d96f0fb7e19796384cc2467447ef1c491694850ebd3670bc457"}, - {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b77e27b79448e34c2c51c09836033056a0547aa360c45eeeb67803da7b0eedaf"}, - {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:518440c991f514331f4850a63560321f833979d145d7d81186dbe2f19e27ae3d"}, - {file = "regex-2023.12.25-cp312-cp312-win32.whl", hash = "sha256:e2610e9406d3b0073636a3a2e80db05a02f0c3169b5632022b4e81c0364bcda5"}, - {file = "regex-2023.12.25-cp312-cp312-win_amd64.whl", hash = "sha256:cc37b9aeebab425f11f27e5e9e6cf580be7206c6582a64467a14dda211abc232"}, - {file = "regex-2023.12.25-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:da695d75ac97cb1cd725adac136d25ca687da4536154cdc2815f576e4da11c69"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d126361607b33c4eb7b36debc173bf25d7805847346dd4d99b5499e1fef52bc7"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4719bb05094d7d8563a450cf8738d2e1061420f79cfcc1fa7f0a44744c4d8f73"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5dd58946bce44b53b06d94aa95560d0b243eb2fe64227cba50017a8d8b3cd3e2"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22a86d9fff2009302c440b9d799ef2fe322416d2d58fc124b926aa89365ec482"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2aae8101919e8aa05ecfe6322b278f41ce2994c4a430303c4cd163fef746e04f"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e692296c4cc2873967771345a876bcfc1c547e8dd695c6b89342488b0ea55cd8"}, - {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:263ef5cc10979837f243950637fffb06e8daed7f1ac1e39d5910fd29929e489a"}, - {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:d6f7e255e5fa94642a0724e35406e6cb7001c09d476ab5fce002f652b36d0c39"}, - {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:88ad44e220e22b63b0f8f81f007e8abbb92874d8ced66f32571ef8beb0643b2b"}, - {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:3a17d3ede18f9cedcbe23d2daa8a2cd6f59fe2bf082c567e43083bba3fb00347"}, - {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d15b274f9e15b1a0b7a45d2ac86d1f634d983ca40d6b886721626c47a400bf39"}, - {file = "regex-2023.12.25-cp37-cp37m-win32.whl", hash = "sha256:ed19b3a05ae0c97dd8f75a5d8f21f7723a8c33bbc555da6bbe1f96c470139d3c"}, - {file = "regex-2023.12.25-cp37-cp37m-win_amd64.whl", hash = "sha256:a6d1047952c0b8104a1d371f88f4ab62e6275567d4458c1e26e9627ad489b445"}, - {file = "regex-2023.12.25-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b43523d7bc2abd757119dbfb38af91b5735eea45537ec6ec3a5ec3f9562a1c53"}, - {file = "regex-2023.12.25-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:efb2d82f33b2212898f1659fb1c2e9ac30493ac41e4d53123da374c3b5541e64"}, - {file = "regex-2023.12.25-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b7fca9205b59c1a3d5031f7e64ed627a1074730a51c2a80e97653e3e9fa0d415"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086dd15e9435b393ae06f96ab69ab2d333f5d65cbe65ca5a3ef0ec9564dfe770"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e81469f7d01efed9b53740aedd26085f20d49da65f9c1f41e822a33992cb1590"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:34e4af5b27232f68042aa40a91c3b9bb4da0eeb31b7632e0091afc4310afe6cb"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9852b76ab558e45b20bf1893b59af64a28bd3820b0c2efc80e0a70a4a3ea51c1"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff100b203092af77d1a5a7abe085b3506b7eaaf9abf65b73b7d6905b6cb76988"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cc038b2d8b1470364b1888a98fd22d616fba2b6309c5b5f181ad4483e0017861"}, - {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:094ba386bb5c01e54e14434d4caabf6583334090865b23ef58e0424a6286d3dc"}, - {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5cd05d0f57846d8ba4b71d9c00f6f37d6b97d5e5ef8b3c3840426a475c8f70f4"}, - {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:9aa1a67bbf0f957bbe096375887b2505f5d8ae16bf04488e8b0f334c36e31360"}, - {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:98a2636994f943b871786c9e82bfe7883ecdaba2ef5df54e1450fa9869d1f756"}, - {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:37f8e93a81fc5e5bd8db7e10e62dc64261bcd88f8d7e6640aaebe9bc180d9ce2"}, - {file = "regex-2023.12.25-cp38-cp38-win32.whl", hash = "sha256:d78bd484930c1da2b9679290a41cdb25cc127d783768a0369d6b449e72f88beb"}, - {file = "regex-2023.12.25-cp38-cp38-win_amd64.whl", hash = "sha256:b521dcecebc5b978b447f0f69b5b7f3840eac454862270406a39837ffae4e697"}, - {file = "regex-2023.12.25-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f7bc09bc9c29ebead055bcba136a67378f03d66bf359e87d0f7c759d6d4ffa31"}, - {file = "regex-2023.12.25-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e14b73607d6231f3cc4622809c196b540a6a44e903bcfad940779c80dffa7be7"}, - {file = "regex-2023.12.25-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9eda5f7a50141291beda3edd00abc2d4a5b16c29c92daf8d5bd76934150f3edc"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc6bb9aa69aacf0f6032c307da718f61a40cf970849e471254e0e91c56ffca95"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:298dc6354d414bc921581be85695d18912bea163a8b23cac9a2562bbcd5088b1"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f4e475a80ecbd15896a976aa0b386c5525d0ed34d5c600b6d3ebac0a67c7ddf"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:531ac6cf22b53e0696f8e1d56ce2396311254eb806111ddd3922c9d937151dae"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22f3470f7524b6da61e2020672df2f3063676aff444db1daa283c2ea4ed259d6"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:89723d2112697feaa320c9d351e5f5e7b841e83f8b143dba8e2d2b5f04e10923"}, - {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0ecf44ddf9171cd7566ef1768047f6e66975788258b1c6c6ca78098b95cf9a3d"}, - {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:905466ad1702ed4acfd67a902af50b8db1feeb9781436372261808df7a2a7bca"}, - {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:4558410b7a5607a645e9804a3e9dd509af12fb72b9825b13791a37cd417d73a5"}, - {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:7e316026cc1095f2a3e8cc012822c99f413b702eaa2ca5408a513609488cb62f"}, - {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3b1de218d5375cd6ac4b5493e0b9f3df2be331e86520f23382f216c137913d20"}, - {file = "regex-2023.12.25-cp39-cp39-win32.whl", hash = "sha256:11a963f8e25ab5c61348d090bf1b07f1953929c13bd2309a0662e9ff680763c9"}, - {file = "regex-2023.12.25-cp39-cp39-win_amd64.whl", hash = "sha256:e693e233ac92ba83a87024e1d32b5f9ab15ca55ddd916d878146f4e3406b5c91"}, - {file = "regex-2023.12.25.tar.gz", hash = "sha256:29171aa128da69afdf4bde412d5bedc335f2ca8fcfe4489038577d05f16181e5"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"}, + {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"}, + {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"}, + {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"}, + {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"}, + {file = "regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"}, + {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"}, + {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"}, + {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"}, + {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"}, + {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"}, + {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"}, ] [[package]] @@ -5004,4 +4990,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13,!=3.9.7" -content-hash = "28d4e2918b3d71048f67e0569e7dfa5b729cc553c33e97bf371e3b8c04803fd8" +content-hash = "459483572dd8347587db50c0e627b839b6b061af2af022ab8d893c70905b04cb" diff --git a/pyproject.toml b/pyproject.toml index be9ecd3e..ff0c2ff4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ rapidfuzz = "^3.8.1" surya-ocr = "^0.4.0" filetype = "^1.2.0" pdftext = "^0.3.4" +regex = "^2024.4.28" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0"