Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issues #109

Merged
merged 3 commits into from
May 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ First, some configuration. Note that settings can be overridden with env vars,
- Your torch device will be automatically detected, but you can also set it manually. For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default.
- If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU). For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
- Depending on your document types, marker's average memory usage per task can vary slightly. You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
- By default, marker will use `ocrmypdf` for OCR on CPU, and `surya` on GPU. Surya is slower on CPU, but more accurate. `ocrmypdf` also requires external dependencies (see above). You can override the default with the `OCR_ENGINE` setting.
- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. Note that `ocrmypdf` requires external dependencies (see above).
- Inspect the other settings in `marker/settings.py`. You can override any settings in the `local.env` file, or by setting environment variables.


Expand Down
4 changes: 4 additions & 0 deletions marker/cleaners/fontstyle.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ def find_bold_italic(pages: List[Page], bold_min_weight=550):
span.italic = True

font_weights.append(span.font_weight)

if len(font_weights) == 0:
return

font_weights = np.array(font_weights)
bold_thresh = np.percentile(font_weights, 90)
bold_thresh_lower = np.percentile(font_weights, 75)
Expand Down
140 changes: 115 additions & 25 deletions marker/cleaners/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,27 @@
import re


def _block_bbox(block):
    """Return the bounding box of ``block``, given either an object or a dict."""
    return block.bbox if hasattr(block, "bbox") else block["bbox"]


def sort_table_blocks(blocks, tolerance=5):
    """Order blocks by ascending y bucket, then by ascending x within a bucket.

    Args:
        blocks: Iterable of items exposing a bounding box either as a ``bbox``
            attribute or under a ``"bbox"`` key. Index 0 of the box is used as
            the x position and index 1 as the y position.
        tolerance: Bucket size for the y position. Blocks whose y position
            rounds to the same multiple of ``tolerance`` are treated as
            belonging to the same row.

    Returns:
        A new list containing the same blocks in sorted order.
    """
    vertical_groups = {}
    for block in blocks:
        # Snap the y position to the nearest multiple of `tolerance` so that
        # slightly misaligned blocks still land in the same row bucket.
        group_key = round(_block_bbox(block)[1] / tolerance) * tolerance
        vertical_groups.setdefault(group_key, []).append(block)

    # Walk the row buckets in ascending y order; sort each one left to right.
    sorted_blocks = []
    for group_key in sorted(vertical_groups):
        row = vertical_groups[group_key]
        sorted_blocks.extend(sorted(row, key=lambda item: _block_bbox(item)[0]))

    return sorted_blocks


def replace_dots(text):
dot_pattern = re.compile(r'(\s*\.\s*){4,}')
dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
Expand All @@ -15,41 +36,100 @@ def replace_dots(text):
return text


def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]:
# Compiled once at import time; replace_newlines is called repeatedly while
# table cells are being assembled, so the pattern should not be rebuilt per call.
_NEWLINE_PATTERN = re.compile(r'[\r\n]+')


def replace_newlines(text):
    """Strip ``text`` and replace each run of CR/LF characters with one space.

    Args:
        text: The string to clean.

    Returns:
        The stripped string with every run of ``\\r`` / ``\\n`` characters
        collapsed into a single space.
    """
    return _NEWLINE_PATTERN.sub(' ', text.strip())


def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
table_rows = []
row_y_coord = None
table_row = []
for block_idx, block in enumerate(page.blocks):
for line_idx, line in enumerate(block.lines):
x_position = None
sorted_blocks = sort_table_blocks(page.blocks)
for block_idx, block in enumerate(sorted_blocks):
sorted_lines = sort_table_blocks(block.lines)
for line_idx, line in enumerate(sorted_lines):
line_bbox = line.bbox
intersect_pct = box_intersection_pct(line_bbox, table_box)
if intersect_pct < .5 or len(line.spans) == 0:
continue
normed_y_start = line_bbox[1] / page.height
if row_y_coord is None or abs(normed_y_start - row_y_coord) < y_tol:
table_row.extend([s.text for s in line.spans])
normed_x_start = line_bbox[0] / page.width
normed_x_end = line_bbox[2] / page.width

cells = [[s.bbox, s.text] for s in line.spans]
if x_position is None or normed_x_start > x_position - space_tol:
# Same row
table_row.extend(cells)
else:
table_rows.append(table_row)
table_row = [s.text for s in line.spans]
row_y_coord = normed_y_start
# New row
if len(table_row) > 0:
table_rows.append(table_row)
table_row = cells
x_position = normed_x_end
if len(table_row) > 0:
table_rows.append(table_row)
table_rows = assign_cells_to_columns(table_rows)
return table_rows


def get_table_pdftext(page: Page, table_box) -> List[List[str]]:
def assign_cells_to_columns(rows, round_factor=4, tolerance=4):
    """Map the cells of each table row onto a shared set of column positions.

    Column separators are derived from the cells' left edges, right edges and
    horizontal centers, each snapped to a multiple of ``round_factor``. The
    candidate list with the fewest distinct values is used (ties prefer left
    edges, then right edges, then centers).

    Args:
        rows: List of rows. Each row is a sequence of cells, and each cell is
            indexable as ``(bbox, text)`` where ``bbox[0]`` is the left x
            coordinate and ``bbox[2]`` is the right x coordinate.
        round_factor: Grid size that positions are snapped to before they are
            de-duplicated, so nearly aligned cells share one separator.
        tolerance: Slack subtracted from a cell's left edge when comparing it
            against a separator.

    Returns:
        A list of rows of cell text. Each row is left-padded with empty
        strings up to the index of its first occupied column. An empty input
        row yields an empty output row.
    """
    def snap(value):
        # Actually round to the grid; a bare `value / f * f` is a no-op and
        # would leave every slightly different coordinate as its own separator.
        return round(value / round_factor) * round_factor

    left_edges = set()
    right_edges = set()
    centers = set()
    for row in rows:
        for cell in row:
            bbox = cell[0]
            left_edges.add(snap(bbox[0]))
            right_edges.add(snap(bbox[2]))
            centers.add(snap((bbox[0] + bbox[2]) / 2))

    # Use whichever alignment produces the fewest distinct column positions.
    separators = min(
        [sorted(left_edges), sorted(right_edges), sorted(centers)],
        key=len,
    )

    new_rows = []
    for row in rows:
        if not row:
            # Nothing to place; avoid calling min() on an empty mapping.
            new_rows.append([])
            continue

        new_row = {}
        last_col_index = -1
        for cell in row:
            left_edge = cell[0][0]
            # First separator to the right of the previous cell's column that
            # the left edge (minus tolerance) falls before. If none matches,
            # take the next free column so the key stays an integer and can
            # never overwrite a cell that was already placed in this row.
            column_index = next(
                (
                    i
                    for i, separator in enumerate(separators)
                    if i > last_col_index and left_edge - tolerance < separator
                ),
                last_col_index + 1,
            )
            new_row[column_index] = cell[1]
            last_col_index = column_index

        # NOTE(review): only leading columns are padded; gaps between occupied
        # columns are not filled, matching the previous behaviour.
        min_column_index = min(new_row)
        flat_row = [new_row[index] for index in sorted(new_row)]
        new_rows.append([""] * min_column_index + flat_row)

    return new_rows


def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
page_width = page.width
table_rows = []
for block_idx, block in enumerate(page.char_blocks):
for line_idx, line in enumerate(block["lines"]):
table_cell = ""
cell_bbox = None
prev_end = None
table_row = []
sorted_char_blocks = sort_table_blocks(page.char_blocks)
for block_idx, block in enumerate(sorted_char_blocks):
sorted_block_lines = sort_table_blocks(block["lines"])
for line_idx, line in enumerate(sorted_block_lines):
line_bbox = line["bbox"]
intersect_pct = box_intersection_pct(line_bbox, table_box)
if intersect_pct < .5:
continue
prev_end = None
table_row = []
table_cell = ""
cell_bbox = None
for span in line["spans"]:
for char in span["chars"]:
x_start, y_start, x_end, y_end = char["bbox"]
Expand All @@ -60,18 +140,28 @@ def get_table_pdftext(page: Page, table_box) -> List[List[str]]:

x_start /= page_width
x_end /= page_width
if prev_end is None or x_start - prev_end < .01:
cell_content = replace_dots(replace_newlines(table_cell))
if prev_end is None or abs(x_start - prev_end) < space_tol: # Check if we are in the same cell
table_cell += char["char"]
else:
table_row.append(replace_dots(table_cell.strip()))
elif x_start > prev_end - space_tol: # Check if we are on the same line
if len(table_cell) > 0:
table_row.append((cell_bbox, cell_content))
table_cell = char["char"]
cell_bbox = char["bbox"]
else: # New line and cell
if len(table_cell) > 0:
table_row.append((cell_bbox, cell_content))
table_cell = char["char"]
cell_bbox = char["bbox"]
if len(table_row) > 0:
table_rows.append(table_row)
table_row = []
prev_end = x_end
if len(table_cell) > 0:
table_row.append(replace_dots(table_cell.strip()))
table_cell = ""
if len(table_row) > 0:
table_rows.append(table_row)
if len(table_cell) > 0:
table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell))))
if len(table_row) > 0:
table_rows.append(table_row)
table_rows = assign_cells_to_columns(table_rows)
return table_rows


Expand Down
8 changes: 8 additions & 0 deletions marker/cleaners/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import re


def cleanup_text(full_text):
    """Normalise blank-line runs and non-breaking spaces in ``full_text``.

    Runs of three or more newlines, and runs of three or more
    newline-plus-whitespace pairs, are each collapsed to exactly two newlines.
    Non-breaking spaces are then replaced with regular spaces.
    """
    blank_run_patterns = (
        r'\n{3,}',
        r'(\n\s){3,}',
    )
    for blank_run in blank_run_patterns:
        full_text = re.sub(blank_run, '\n\n', full_text)

    # Replace non-breaking spaces
    return full_text.replace('\xa0', ' ')
6 changes: 4 additions & 2 deletions marker/convert.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import warnings

from marker.cleaners.text import cleanup_text

warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings

import pypdfium2 as pdfium
Expand Down Expand Up @@ -131,8 +134,7 @@ def convert_single_pdf(
full_text = get_full_text(text_blocks)

# Handle empty blocks being joined
full_text = re.sub(r'\n{3,}', '\n\n', full_text)
full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
full_text = cleanup_text(full_text)

# Replace bullet characters with a -
full_text = replace_bullets(full_text)
Expand Down
46 changes: 36 additions & 10 deletions marker/equations/equations.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from marker.equations.inference import get_total_texify_tokens, get_latex_batched
from marker.schema.bbox import rescale_bbox
from marker.schema.page import Page
from marker.schema.block import Line, Span, Block, bbox_from_lines
from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines
from marker.settings import settings


Expand All @@ -28,11 +28,7 @@ def find_equation_blocks(page, processor):
equation_lines[region_idx].append(line)

if region_idx not in insert_points:
# Insert before the block if line is at the beginning of the block, otherwise after the block
if line_idx <= len(block.lines) // 2:
insert_points[region_idx] = block_idx
else:
insert_points[region_idx] = block_idx + 1
insert_points[region_idx] = (block_idx, line_idx)

block_lines_to_remove = defaultdict(set)
for region_idx, equation_region in enumerate(equation_regions):
Expand All @@ -44,8 +40,13 @@ def find_equation_blocks(page, processor):
equation_bbox = bbox_from_lines(equation_block)

total_tokens = get_total_texify_tokens(block_text, processor)
selected_blocks = (equation_insert, total_tokens, block_text, equation_bbox)
equation_insert_line_idx = equation_insert[1]
equation_insert_line_idx -= len(
[x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]])

selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_bbox]
if total_tokens < settings.TEXIFY_MODEL_MAX:
# Account for the lines we're about to remove
for item in lines_to_remove[region_idx]:
block_lines_to_remove[item[0]].add(item[1])
equation_blocks.append(selected_blocks)
Expand All @@ -58,12 +59,19 @@ def find_equation_blocks(page, processor):
return equation_blocks


def increment_insert_points(page_equation_blocks, insert_block_idx, insert_count):
    """Shift the block index of equation entries in place.

    Every entry whose first element (its block index) is greater than or equal
    to ``insert_block_idx`` has that element increased by ``insert_count``.
    Each entry must be a mutable five-item sequence of
    ``[block_idx, line_idx, token_count, block_text, equation_bbox]``.
    Returns ``None``; ``page_equation_blocks`` is modified directly.
    """
    for entry in page_equation_blocks:
        # Unpack all five fields so malformed entries still fail loudly.
        current_block_idx, _line_idx, _token_count, _block_text, _bbox = entry
        if current_block_idx < insert_block_idx:
            continue
        entry[0] += insert_count


def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnum, processor):
converted_spans = []
idx = 0
success_count = 0
fail_count = 0
for block_number, (insert_point, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
total_inserted = 0
for block_number, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
latex_text = predictions[block_number]
conditions = [
get_total_texify_tokens(latex_text, processor) < settings.TEXIFY_MODEL_MAX, # Make sure we didn't get to the overall token max, indicates run-on
Expand Down Expand Up @@ -97,7 +105,25 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
new_block.lines[0].spans[0].text = latex_text
converted_spans.append(deepcopy(new_block.lines[0].spans[0]))

page_blocks.blocks.insert(insert_point, new_block)
# Add in the new LaTeX block
if insert_line_idx == 0:
page_blocks.blocks.insert(insert_block_idx, new_block)
increment_insert_points(page_equation_blocks, insert_block_idx, 1)
elif insert_line_idx >= len(page_blocks.blocks[insert_block_idx].lines):
page_blocks.blocks.insert(insert_block_idx + 1, new_block)
increment_insert_points(page_equation_blocks, insert_block_idx + 1, 1)
else:
new_blocks = []
for block_idx, block in enumerate(page_blocks.blocks):
if block_idx == insert_block_idx:
split_block = split_block_lines(block, insert_line_idx)
new_blocks.append(split_block[0])
new_blocks.append(new_block)
new_blocks.append(split_block[1])
increment_insert_points(page_equation_blocks, insert_block_idx, 2)
else:
new_blocks.append(block)
page_blocks.blocks = new_blocks

return success_count, fail_count, converted_spans

Expand All @@ -117,7 +143,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings.
token_counts = []
for page_idx, page_equation_blocks in enumerate(equation_blocks):
page_obj = doc[page_idx]
for equation_idx, (insert_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox)

images.append(png_image)
Expand Down
4 changes: 2 additions & 2 deletions marker/ocr/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]):
return len(full_text.strip()) == 0


def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6):
def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3):
found_lines = 0
for detected_line in page.text_lines.bboxes:

Expand All @@ -63,7 +63,7 @@ def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6)
total_intersection = 0
for block in page.blocks:
for line in block.lines:
intersection_pct = box_intersection_pct(detected_bbox, line.bbox)
intersection_pct = box_intersection_pct(line.bbox, detected_bbox)
total_intersection += intersection_pct
if total_intersection > intersect_thresh:
found_lines += 1
Expand Down
6 changes: 3 additions & 3 deletions marker/ocr/recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor
ocr_success += 1
pages[orig_idx] = page

return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success, "ocr_engine": ocr_method}


def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page]) -> List[Optional[Page]]:
Expand Down Expand Up @@ -120,8 +120,8 @@ def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
out_pdf,
language=langs[0],
output_type="pdf",
redo_ocr=None if settings.OCR_ALL_PAGES else True,
force_ocr=True if settings.OCR_ALL_PAGES else None,
redo_ocr=None,
force_ocr=True,
progress_bar=False,
optimize=False,
fast_web_view=1e6,
Expand Down
15 changes: 12 additions & 3 deletions marker/pdf/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
for l in block["lines"]:
spans = []
for i, s in enumerate(l["spans"]):
block_text = s["text"].rstrip("\n")
block_text = s["text"]
# Remove trailing newlines and carriage returns (tesseract)
while len(block_text) > 0 and block_text[-1] in ["\n", "\r"]:
block_text = block_text[:-1]

block_text = block_text.replace("-\n", "") # Remove hyphenated line breaks
span_obj = Span(
text=block_text.rstrip("\n"), # Remove end of line newlines, not spaces
text=block_text, # Remove end of line newlines, not spaces
bbox=s["bbox"],
span_id=f"{pnum}_{span_id}",
font=f"{s['font']['name']}_{font_flags_decomposer(s['font']['flags'])}", # Add font flags to end of font
Expand All @@ -49,10 +53,15 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
# Only select blocks with lines
if len(block_lines) > 0:
page_blocks.append(block_obj)

page_bbox = page["bbox"]
page_width = abs(page_bbox[2] - page_bbox[0])
page_height = abs(page_bbox[3] - page_bbox[1])
page_bbox = [0, 0, page_width, page_height]
out_page = Page(
blocks=page_blocks,
pnum=page["page"],
bbox=page["bbox"],
bbox=page_bbox,
rotation=page["rotation"],
char_blocks=page["blocks"]
)
Expand Down
Empty file added marker/postprocessors/images.py
Empty file.
Loading
Loading