From aea44244b3e8c5ecc628634951aa5ed47bc5c6c8 Mon Sep 17 00:00:00 2001
From: Konstantin Baierer
Date: Tue, 13 Apr 2021 12:54:44 +0200
Subject: [PATCH] set @TYPE as well as TAGREFS from PAGE @type, #4

---
 README.md                   | 4 ++--
 ocrd_page_to_alto/styles.py | 1 +
 tests/test_convert.py       | 2 ++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ef206a8..38d87e6 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,10 @@
 * [x] TextStyle
 * [x] ParagraphStyle
 * [x] table regions
-* [ ] recursive regions
+* [ ] recursive regions, #
 * [ ] rotation
 * [x] reading order
-* [x] input PAGE-XML not having words
+* [x] input PAGE-XML not having words #5
 * [x] multiple pc:TextEquivs
 * [x] language
 * [X] ~~script~~ no equivalent in ALTO :(
diff --git a/ocrd_page_to_alto/styles.py b/ocrd_page_to_alto/styles.py
index 822467e..0645b8b 100644
--- a/ocrd_page_to_alto/styles.py
+++ b/ocrd_page_to_alto/styles.py
@@ -122,5 +122,6 @@ def __init__(self):
     def set_alto_tag_from_type(self, reg_alto, reg_page):
         typ = reg_page.get_type() if hasattr(reg_page, 'get_type') else None
         if typ:
+            reg_alto.set('TYPE', typ)
             reg_alto.set('TAGREFS', self.get_id(label=typ))
 
diff --git a/tests/test_convert.py b/tests/test_convert.py
index 88b2d38..183c03d 100644
--- a/tests/test_convert.py
+++ b/tests/test_convert.py
@@ -58,6 +58,8 @@ def test_layouttag():
     c = OcrdPageAltoConverter(page_filename='tests/data/layouttag.page.xml').convert()
     tree = ET.fromstring(str(c).encode('utf-8'))
     assert [x.get('LABEL') for x in tree.xpath('//alto:Tags/alto:LayoutTag', namespaces=NAMESPACES)] == ['paragraph']
+    assert len(tree.xpath('//*[@TYPE="paragraph"]')) == 1
+    assert len(tree.xpath('//*[@TYPE="catch-word"]')) == 0 # @TYPE only allowed for BlockType
 
 def test_pararaphstyle():
     c = OcrdPageAltoConverter(page_filename='tests/data/align.page.xml').convert()