OCR-D · kba · Sep 13, 2022 · Feb 7, 2022 · Feb 7, 2022 · Feb 7, 2022
diff --git a/ocrd_page_to_alto/cli.py b/ocrd_page_to_alto/cli.py
@@ -18,10 +18,12 @@
 @click.option('--textequiv-index', default=0, help='If multiple textequiv, use the n-th TextEquiv by @index')
 @click.option('--textequiv-fallback-strategy', default='last', type=click.Choice(['raise', 'first', 'last']), help="What to do if nth textequiv isn't available. 'raise' will lead to a runtime error, 'first' will use the first TextEquiv, 'last' will use the last TextEquiv on the element")
 @click.option('--region-order', default='document', help="Order in which to iterate over the regions", type=click.Choice(['document', 'reading-order', 'reading-order-only']))
+@click.option('--textline-order', default='document', help="Order in which to iterate over the textlines", type=click.Choice(['document', 'index', 'textline-order']))
 @click.option('-O', '--output-file', default='-', help='Output filename (or "-" for standard output, the default)',
               type=click.Path(dir_okay=False, writable=True, exists=False, allow_dash=True))
 @click.argument('filename',  type=click.Path(dir_okay=False, exists=True))
-def main(log_level, alto_version, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, region_order, output_file, filename):
+def main(log_level, alto_version, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, 
+         textequiv_index, textequiv_fallback_strategy, region_order, textline_order, output_file, filename):
     """
     Convert PAGE to ALTO
     """
@@ -37,7 +39,8 @@ def main(log_level, alto_version, check_words, check_border, skip_empty_lines, t
         dummy_word=dummy_word,
         textequiv_index=textequiv_index,
         textequiv_fallback_strategy=textequiv_fallback_strategy,
-        region_order=region_order
+        region_order=region_order,
+        textline_order=textline_order
     )
     converter.convert()
     with open(1 if output_file == '-' else output_file, 'w') as output:

diff --git a/ocrd_page_to_alto/convert.py b/ocrd_page_to_alto/convert.py
@@ -69,6 +69,7 @@ def __init__(
         textequiv_index=0,
         textequiv_fallback_strategy='last',
         region_order='document',
+        textline_order='document',
         page_filename=None,
         dummy_textline=True,
         dummy_word=True,
@@ -86,6 +87,7 @@ def __init__(
             textequiv_index (int): @index of the TextEquiv to choose
             textequiv_fallback_strategy ("raise"|"first"|"last"): Strategy to handle case of no matchin TextEquiv by textequiv_index
             region_order ("document"|"reading-order"|"reading-order-only"): The order in which to iterate over regions.
+            textline_order ("document"|"index"|"textline-order"): The order in which to iterate over textlines.
             dummy_textline (boolean): Whether to create a TextLine for regions that have TextEquiv/Unicode but no TextLine
             dummy_word (boolean): Whether to create a Word for TextLine that have TextEquiv/Unicode but no Word
         """
@@ -98,6 +100,7 @@ def __init__(
         self.trailing_dash_to_hyp = trailing_dash_to_hyp
         self.dummy_textline = dummy_textline
         self.region_order = region_order
+        self.textline_order = textline_order
         self.dummy_word = dummy_word
         self.logger = logger if logger else getLogger('page-to-alto')
         if pcgts:
@@ -274,7 +277,19 @@ def convert_metadata(self):
     def _convert_textlines(self, reg_alto, reg_page):
         if self.dummy_textline:
             self.set_dummy_line_for_region(reg_page)
-        for line_page in reg_page.get_TextLine():
+        lines = reg_page.get_TextLine()
+        if self.textline_order == 'document':
+            pass
+        elif self.textline_order == 'index':
+            def by_index(line):
+                if line.index is None:
+                    return 0 # no index: sorts like index 0; ties keep document order (sorted() is stable)
+                return line.index
+            lines = sorted(lines, key=by_index)
+        elif self.textline_order == 'textline-order':
+            # something with reg_page.textLineOrder or reg_page.get_parent_.textLineOrder
+            raise Exception("@textLineOrder semantics not implemented; cf. PRImA-Research-Lab/PAGE-XML#26")
+        for line_page in lines:
             is_empty_line = not(line_page.get_TextEquiv() and line_page.get_TextEquiv()[0].get_Unicode()) and not(line_page.get_Word())
             if is_empty_line and self.skip_empty_lines:
                 self.logger.debug("Skipping empty line '%s'", line_page.id)