Skip to content

Commit

Permalink
convert languages, PRImA-Research-Lab/PAGE-XML#27, #3
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Apr 8, 2021
1 parent 570a917 commit f1b67bd
Show file tree
Hide file tree
Showing 7 changed files with 3,803 additions and 3 deletions.
8 changes: 7 additions & 1 deletion ocrd_page_to_alto/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@

from .utils import (
set_alto_id_from_page_id,
set_alto_xywh_from_coords,
set_alto_lang_from_page_lang,
set_alto_shape_from_coords,
set_alto_xywh_from_coords,
setxml
)
from .styles import TextStylesManager
Expand Down Expand Up @@ -171,6 +172,7 @@ def _convert_textlines(self, reg_alto, reg_page):
set_alto_id_from_page_id(line_alto, line_page)
set_alto_xywh_from_coords(line_alto, line_page)
set_alto_shape_from_coords(line_alto, line_page)
set_alto_lang_from_page_lang(line_alto, line_page)
self.textstyle_mgr.set_alto_styleref_from_textstyle(line_alto, line_page)
# XXX ALTO does not allow TextLine without at least one String
if is_empty_line:
Expand All @@ -181,6 +183,7 @@ def _convert_textlines(self, reg_alto, reg_page):
set_alto_id_from_page_id(word_alto, word_page)
set_alto_xywh_from_coords(word_alto, word_page)
set_alto_shape_from_coords(word_alto, word_page)
set_alto_lang_from_page_lang(word_alto, word_page)
word_alto.set('CONTENT', word_page.get_TextEquiv()[0].get_Unicode())

def _convert_table(self, parent_alto, parent_page, level=0):
Expand All @@ -191,11 +194,13 @@ def _convert_table(self, parent_alto, parent_page, level=0):
if parent_page.get_TextRegion():
reg_alto = ET.SubElement(parent_alto, 'ComposedBlock')
set_alto_id_from_page_id(reg_alto, parent_page) # TODO not unique!
set_alto_lang_from_page_lang(reg_alto, parent_page)
for reg_page in parent_page.get_TextRegion():
self._convert_table(reg_alto, reg_page, level=level + 1)
else:
textblock_alto = ET.SubElement(parent_alto, 'TextBlock')
set_alto_id_from_page_id(textblock_alto, parent_page)
set_alto_lang_from_page_lang(textblock_alto, parent_page)
self._convert_textlines(textblock_alto, parent_page)

def convert_text(self):
Expand All @@ -208,6 +213,7 @@ def convert_text(self):
set_alto_id_from_page_id(reg_alto, reg_page)
set_alto_xywh_from_coords(reg_alto, reg_page)
set_alto_shape_from_coords(reg_alto, reg_page)
set_alto_lang_from_page_lang(reg_alto, reg_page)
self.textstyle_mgr.set_alto_styleref_from_textstyle(reg_alto, reg_page)
if reg_page_type == 'Text':
self._convert_textlines(reg_alto, reg_page)
Expand Down
10 changes: 10 additions & 0 deletions ocrd_page_to_alto/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from lxml import etree as ET
from ocrd_utils import xywh_from_points
import langcodes

def setxml(el, name, val):
    """Set attribute ``name`` on element ``el`` to the string form of ``val``."""
    # Attribute values are always passed to ``el.set`` as ``str``.
    text = str(val)
    el.set(name, text)
Expand All @@ -20,3 +21,12 @@ def set_alto_shape_from_coords(reg_alto, reg_page):

def set_alto_id_from_page_id(reg_alto, reg_page):
    """Write the ``id`` of ``reg_page`` to the ``ID`` attribute of ``reg_alto``."""
    page_id = reg_page.id
    setxml(reg_alto, 'ID', page_id)

def set_alto_lang_from_page_lang(reg_alto, reg_page):
    """Set the ``LANG`` attribute of ``reg_alto`` from the language of ``reg_page``.

    The attributes ``primaryLanguage``, ``secondaryLanguage`` and ``language``
    of ``reg_page`` are inspected in that order. The first one that is present
    and non-empty is looked up with ``langcodes.find`` and the result of
    ``to_alpha3()`` is written to ``LANG``. Later attributes are not consulted
    once a non-empty value has been found.

    If no language attribute is set, or the value cannot be resolved to a
    language code, ``LANG`` is left untouched.
    """
    for attr_name in ('primaryLanguage', 'secondaryLanguage', 'language'):
        lang_page = getattr(reg_page, attr_name, None)
        if not lang_page:
            continue
        try:
            lang_alto = langcodes.find(lang_page).to_alpha3()
        except LookupError:
            # langcodes raises LookupError both for names it cannot find and
            # for languages without an alpha-3 code. Do not abort the whole
            # conversion for that, and do not fall back to a lower-priority
            # attribute, which could mislabel the content.
            import logging
            logging.getLogger(__name__).warning(
                "Cannot map PAGE language %r to a language code, not setting LANG",
                lang_page)
            return
        setxml(reg_alto, 'LANG', lang_alto)
        return

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
ocrd >= 2.23.2
lxml
langcodes[data] >= 3.1.0
1,105 changes: 1,105 additions & 0 deletions tests/data/alto-4-2.xsd

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions tests/data/language.page.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="OCR-D-OCR-CALAMARI_00001">
<pc:Page imageFilename="OCR-D-IMG/044417.jpg" imageWidth="3195" imageHeight="4370" type="content" primaryLanguage="Welsh" secondaryLanguage="Urdu" primaryScript="Avst - Avestan" secondaryScript="Cans - Unified Canadian Aboriginal Syllabics">
<pc:Border>
<pc:Coords points="61,254 2770,254 2770,4360 61,4360"/>
</pc:Border>
<pc:TextRegion id="r1" primaryLanguage="Volapük" secondaryLanguage="Interlingua" primaryScript="Cham - Cham" secondaryScript="Buhd - Buhid">
<pc:Coords points="0,0 1,1 1,0 0,1"/>
<pc:TextLine id="r1-l1" primaryLanguage="Norwegian Bokmål">
<pc:Coords points="0,0 1,1 1,0 0,1"/>
<pc:Word id="r1-l1-w1" language="Esperanto">
<pc:Coords points="0,0 1,1 1,0 0,1"/>
<pc:TextEquiv>
<pc:Unicode>patrofikulo</pc:Unicode>
</pc:TextEquiv>
</pc:Word>
<pc:TextEquiv>
<pc:Unicode>patrofikulo</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
</pc:TextRegion>
</pc:Page>
</pc:PcGts>
Loading

0 comments on commit f1b67bd

Please sign in to comment.