From 847e1a5dfd226bbad30a5c86b9b077dabb08fce0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 7 Feb 2022 11:11:53 +0100 Subject: [PATCH 1/3] convert_textlines: implement by-index order option --- ocrd_page_to_alto/convert.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/ocrd_page_to_alto/convert.py b/ocrd_page_to_alto/convert.py index 2d2f5a5..df2af11 100644 --- a/ocrd_page_to_alto/convert.py +++ b/ocrd_page_to_alto/convert.py @@ -69,6 +69,7 @@ def __init__( textequiv_index=0, textequiv_fallback_strategy='last', region_order='document', + textline_order='document', page_filename=None, dummy_textline=True, dummy_word=True, @@ -86,6 +87,7 @@ def __init__( textequiv_index (int): @index of the TextEquiv to choose textequiv_fallback_strategy ("raise"|"first"|"last"): Strategy to handle case of no matchin TextEquiv by textequiv_index region_order ("document"|"reading-order"|"reading-order-only"): The order in which to iterate over regions. + textline_order ("document"|"index"|"textline-order"): The order in which to iterate over textlines. dummy_textline (boolean): Whether to create a TextLine for regions that have TextEquiv/Unicode but no TextLine dummy_word (boolean): Whether to create a Word for TextLine that have TextEquiv/Unicode but no Word """ @@ -98,6 +100,7 @@ def __init__( self.trailing_dash_to_hyp = trailing_dash_to_hyp self.dummy_textline = dummy_textline self.region_order = region_order + self.textline_order = textline_order self.dummy_word = dummy_word self.logger = logger if logger else getLogger('page-to-alto') if pcgts: @@ -274,7 +277,19 @@ def convert_metadata(self): def _convert_textlines(self, reg_alto, reg_page): if self.dummy_textline: self.set_dummy_line_for_region(reg_page) - for line_page in reg_page.get_TextLine(): + lines = reg_page.get_TextLine() + if self.textline_order == 'document': + : + elif self.textline_order == 'index': + def by_index(line): + if line.index is None: + return 0 # keep order + return line.index + lines = sorted(lines, key=by_index) + elif self.textline_order == 'textline-order': + # something with reg_page.textLineOrder or reg_page.get_parent_.textLineOrder + raise Exception("@textLineOrder semantics not implemented; cf. PRImA-Research-Lab/PAGE-XML#26") + for line_page in lines: is_empty_line = not(line_page.get_TextEquiv() and line_page.get_TextEquiv()[0].get_Unicode()) and not(line_page.get_Word()) if is_empty_line and self.skip_empty_lines: self.logger.debug("Skipping empty line '%s'", line_page.id) From d81e8e68553205277be8a9cec78660c2ce513563 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 7 Feb 2022 11:14:20 +0100 Subject: [PATCH 2/3] add textline-order option --- ocrd_page_to_alto/cli.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ocrd_page_to_alto/cli.py b/ocrd_page_to_alto/cli.py index fd8f3f7..ea30800 100644 --- a/ocrd_page_to_alto/cli.py +++ b/ocrd_page_to_alto/cli.py @@ -18,10 +18,12 @@ @click.option('--textequiv-index', default=0, help='If multiple textequiv, use the n-th TextEquiv by @index') @click.option('--textequiv-fallback-strategy', default='last', type=click.Choice(['raise', 'first', 'last']), help="What to do if nth textequiv isn't available. 'raise' will lead to a runtime error, 'first' will use the first TextEquiv, 'last' will use the last TextEquiv on the element") @click.option('--region-order', default='document', help="Order in which to iterate over the regions", type=click.Choice(['document', 'reading-order', 'reading-order-only'])) +@click.option('--textline-order', default='document', help="Order in which to iterate over the textlines", type=click.Choice(['document', 'index', 'textline-order'])) @click.option('-O', '--output-file', default='-', help='Output filename (or "-" for standard output, the default)', type=click.Path(dir_okay=False, writable=True, exists=False, allow_dash=True)) @click.argument('filename', type=click.Path(dir_okay=False, exists=True)) -def main(log_level, alto_version, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, region_order, output_file, filename): +def main(log_level, alto_version, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, + textequiv_index, textequiv_fallback_strategy, region_order, textline_order, output_file, filename): """ Convert PAGE to ALTO """ @@ -37,7 +39,8 @@ def main(log_level, alto_version, check_words, check_border, skip_empty_lines, t dummy_word=dummy_word, textequiv_index=textequiv_index, textequiv_fallback_strategy=textequiv_fallback_strategy, - region_order=region_order + region_order=region_order, + textline_order=textline_order ) converter.convert() with open(1 if output_file == '-' else output_file, 'w') as output: From 5c974a10546f6ffb08edf91925c5f123c5455e41 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 7 Feb 2022 11:16:10 +0100 Subject: [PATCH 3/3] fix typo --- ocrd_page_to_alto/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_page_to_alto/convert.py b/ocrd_page_to_alto/convert.py index df2af11..06dc868 100644 --- a/ocrd_page_to_alto/convert.py +++ b/ocrd_page_to_alto/convert.py @@ -279,7 +279,7 @@ def _convert_textlines(self, reg_alto, reg_page): self.set_dummy_line_for_region(reg_page) lines = reg_page.get_TextLine() if self.textline_order == 'document': - : + pass elif self.textline_order == 'index': def by_index(line): if line.index is None: