Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

convert_textlines: implement by-index order option #29

Merged
merged 3 commits into from
Sep 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions ocrd_page_to_alto/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@
@click.option('--textequiv-index', default=0, help='If multiple textequiv, use the n-th TextEquiv by @index')
@click.option('--textequiv-fallback-strategy', default='last', type=click.Choice(['raise', 'first', 'last']), help="What to do if nth textequiv isn't available. 'raise' will lead to a runtime error, 'first' will use the first TextEquiv, 'last' will use the last TextEquiv on the element")
@click.option('--region-order', default='document', help="Order in which to iterate over the regions", type=click.Choice(['document', 'reading-order', 'reading-order-only']))
@click.option('--textline-order', default='document', help="Order in which to iterate over the textlines", type=click.Choice(['document', 'index', 'textline-order']))
@click.option('-O', '--output-file', default='-', help='Output filename (or "-" for standard output, the default)',
type=click.Path(dir_okay=False, writable=True, exists=False, allow_dash=True))
@click.argument('filename', type=click.Path(dir_okay=False, exists=True))
def main(log_level, alto_version, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, region_order, output_file, filename):
def main(log_level, alto_version, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word,
textequiv_index, textequiv_fallback_strategy, region_order, textline_order, output_file, filename):
"""
Convert PAGE to ALTO
"""
Expand All @@ -37,7 +39,8 @@ def main(log_level, alto_version, check_words, check_border, skip_empty_lines, t
dummy_word=dummy_word,
textequiv_index=textequiv_index,
textequiv_fallback_strategy=textequiv_fallback_strategy,
region_order=region_order
region_order=region_order,
textline_order=textline_order
)
converter.convert()
with open(1 if output_file == '-' else output_file, 'w') as output:
Expand Down
17 changes: 16 additions & 1 deletion ocrd_page_to_alto/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def __init__(
textequiv_index=0,
textequiv_fallback_strategy='last',
region_order='document',
textline_order='document',
page_filename=None,
dummy_textline=True,
dummy_word=True,
Expand All @@ -86,6 +87,7 @@ def __init__(
textequiv_index (int): @index of the TextEquiv to choose
textequiv_fallback_strategy ("raise"|"first"|"last"): Strategy to handle case of no matching TextEquiv by textequiv_index
region_order ("document"|"reading-order"|"reading-order-only"): The order in which to iterate over regions.
textline_order ("document"|"index"|"textline-order"): The order in which to iterate over textlines.
dummy_textline (boolean): Whether to create a TextLine for regions that have TextEquiv/Unicode but no TextLine
dummy_word (boolean): Whether to create a Word for TextLine that have TextEquiv/Unicode but no Word
"""
Expand All @@ -98,6 +100,7 @@ def __init__(
self.trailing_dash_to_hyp = trailing_dash_to_hyp
self.dummy_textline = dummy_textline
self.region_order = region_order
self.textline_order = textline_order
self.dummy_word = dummy_word
self.logger = logger if logger else getLogger('page-to-alto')
if pcgts:
Expand Down Expand Up @@ -274,7 +277,19 @@ def convert_metadata(self):
def _convert_textlines(self, reg_alto, reg_page):
if self.dummy_textline:
self.set_dummy_line_for_region(reg_page)
for line_page in reg_page.get_TextLine():
lines = reg_page.get_TextLine()
if self.textline_order == 'document':
pass
elif self.textline_order == 'index':
def by_index(line):
if line.index is None:
return 0 # keep order
return line.index
lines = sorted(lines, key=by_index)
elif self.textline_order == 'textline-order':
# something with reg_page.textLineOrder or reg_page.get_parent_.textLineOrder
raise Exception("@textLineOrder semantics not implemented; cf. PRImA-Research-Lab/PAGE-XML#26")
for line_page in lines:
is_empty_line = not(line_page.get_TextEquiv() and line_page.get_TextEquiv()[0].get_Unicode()) and not(line_page.get_Word())
if is_empty_line and self.skip_empty_lines:
self.logger.debug("Skipping empty line '%s'", line_page.id)
Expand Down