Skip to content

Commit

Permalink
add hyphen handling
Browse files: browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Dec 6, 2024
1 parent 91f928c commit 2b324e7
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
1 change: 1 addition & 0 deletions marker/providers/pdf_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def get_lines(spans):
]

if span["text"].endswith("\r\n") or span["text"].endswith("\x02"):
span["text"] = span["text"].replace("\x02", "-")
lines.append(current_line)
current_line = None

Expand Down
10 changes: 8 additions & 2 deletions marker/schema/text/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,15 @@ def assemble_html(self, document, child_blocks, parent_structure):
for c in child_blocks:
template += c.html

raw_text = remove_tags(template).strip()
structure_idx = parent_structure.index(self.id)
if not structure_idx < len(parent_structure) - 1:
template = template.removesuffix(' ') # strip any trailing whitespace from the last line
if structure_idx < len(parent_structure) - 1:
next_block_id = parent_structure[structure_idx + 1]
next_line = document.get_block(next_block_id)
next_line_raw_text = next_line.raw_text(document)
template = strip_trailing_hyphens(raw_text, next_line_raw_text, template)
else:
template = template.strip(' ') # strip any trailing whitespace from the last line

return template

Expand Down

0 comments on commit 2b324e7

Please sign in to comment.