From 99e80c54eb0359f6be857972e52cc8a92eaa4640 Mon Sep 17 00:00:00 2001
From: Konstantin Baierer
Date: Wed, 7 Apr 2021 14:55:40 +0200
Subject: [PATCH] conversion of TextStyle

---
 ocrd_page_to_alto/convert.py |  19 ++++++-
 ocrd_page_to_alto/styles.py  | 107 ++++++++++++++++++++++++++++++++++++
 ocrd_page_to_alto/utils.py   |   7 +--
 tests/test_styles.py         |  34 ++++++++++++
 4 files changed, 160 insertions(+), 7 deletions(-)
 create mode 100644 ocrd_page_to_alto/styles.py
 create mode 100644 tests/test_styles.py

diff --git a/ocrd_page_to_alto/convert.py b/ocrd_page_to_alto/convert.py
index 07da7a5..0fa7458 100644
--- a/ocrd_page_to_alto/convert.py
+++ b/ocrd_page_to_alto/convert.py
@@ -10,6 +10,7 @@
     set_alto_shape_from_coords,
     setxml
 )
+from .styles import TextStylesManager
 
 NAMESPACES = {**NAMESPACES_}
 NAMESPACES['xsi'] = 'http://www.w3.org/2001/XMLSchema-instance'
@@ -62,6 +63,7 @@ def __init__(self, *, check_words=True, check_border=True, skip_empty_lines=Fals
             raise ValueError("The PAGE-XML to transform contains neither Border nor PrintSpace")
         self.alto_alto, self.alto_description, self.alto_styles, self.alto_tags, self.alto_page = self.create_alto()
         self.alto_printspace = self.convert_border()
+        self.textstyle_mgr = TextStylesManager()
 
     def __str__(self):
         return ET.tostring(self.alto_alto, pretty_print=True).decode('utf-8')
@@ -83,6 +85,10 @@ def convert(self):
         self.convert_metadata()
         self.convert_text()
         self.convert_reading_order()
+        self.convert_styles()
+
+    def convert_styles(self):
+        self.textstyle_mgr.to_xml(self.alto_styles)
 
     def convert_reading_order(self):
         index_order = [x.id for x in self.page_page.get_AllRegions(order='reading-order', depth=1)]
@@ -151,13 +157,14 @@ def _convert_textlines(self, reg_alto, reg_page):
             if is_empty_line and self.skip_empty_lines:
                 return
             line_alto = ET.SubElement(reg_alto, 'TextLine')
-            if is_empty_line:
-                word_alto_empty = ET.SubElement(line_alto, 'String')
-                word_alto_empty.set('CONTENT', '')
             set_alto_id_from_page_id(line_alto, line_page)
             set_alto_xywh_from_coords(line_alto, line_page)
             set_alto_shape_from_coords(line_alto, line_page)
+            self.set_alto_styleref_from_textstyle(line_alto, line_page)
             # XXX ALTO does not allow TextLine without at least one String
+            if is_empty_line:
+                word_alto_empty = ET.SubElement(line_alto, 'String')
+                word_alto_empty.set('CONTENT', '')
             for word_page in line_page.get_Word():
                 word_alto = ET.SubElement(line_alto, 'String')
                 set_alto_id_from_page_id(word_alto, word_page)
@@ -175,6 +182,7 @@ def convert_text(self):
             set_alto_id_from_page_id(reg_alto, reg_page)
             set_alto_xywh_from_coords(reg_alto, reg_page)
             set_alto_shape_from_coords(reg_alto, reg_page)
+            self.set_alto_styleref_from_textstyle(reg_alto, reg_page)
             if reg_page_type == 'Text':
                 self._convert_textlines(reg_alto, reg_page)
             elif reg_page_type == 'Table':
@@ -186,3 +194,8 @@ def convert_text(self):
             else:
                 raise ValueError('Unhandled region type %s' % reg_page_type)
 
+
+    def set_alto_styleref_from_textstyle(self, reg_alto, reg_page):
+        textstyle = reg_page.get_TextStyle() if hasattr(reg_page, 'get_TextStyle') else None
+        if textstyle:
+            reg_alto.set('STYLEREFS', self.textstyle_mgr.from_textstyle(textstyle))
diff --git a/ocrd_page_to_alto/styles.py b/ocrd_page_to_alto/styles.py
new file mode 100644
--- /dev/null
+++ b/ocrd_page_to_alto/styles.py
@@ -0,0 +1,107 @@
+from lxml import etree as ET
+
+# PAGE textColour names mapped to hex RGB, see https://en.wikipedia.org/wiki/Web_colors
+PAGE_COLOUR_TO_RGB = {
+    'black': '000000',
+    'blue': '0000ff',
+    'brown': '800000',
+    'cyan': '00ffff',
+    'green': '00ff00',
+    'grey': '999999',
+    'indigo': '4b0082',
+    'magenta': 'ff00ff',
+    'orange': 'ffa500',
+    'pink': 'ffc0cb',
+    'red': 'ff0000',
+    'turquoise': '40e0d0',
+    'violet': 'ee82ee',
+    'white': 'ffffff',
+    'yellow': 'ffff00',
+}
+
+class TextStylesManager():
+    """Collect distinct text styles and serialize them as ALTO TextStyle elements."""
+
+    def __init__(self):
+        self._styles = set()
+        self.fields = ('font_family', 'font_type', 'font_width', 'font_size', 'font_color', 'font_style')
+        self.output_element = 'TextStyle'
+
+    def get_style_id(self, **kwargs):
+        """
+        Register the style described by ``kwargs`` and return its ID.
+
+        The ID joins the values of all fields (``None`` if unset) with ``---``,
+        spaces are encoded as ``%20``. Raises ValueError for unknown fields.
+        """
+        if any(k not in self.fields for k in kwargs):
+            raise ValueError(f"Unknown fields in {kwargs}")
+        key = '---'.join(str(kwargs.get(x)).replace(' ', '%20') for x in self.fields)
+        # Adding to a set is idempotent, no membership test needed
+        self._styles.add(key)
+        return key
+
+    @property
+    def styles(self):
+        """Map every registered style ID to a dict of its decoded field values."""
+        ret = {}
+        # Sorted so the serialized XML is reproducible
+        for key in sorted(self._styles):
+            vals = key.split('---')
+            ret[key] = {field: val.replace('%20', ' ') for field, val in zip(self.fields, vals)}
+        return ret
+
+    def from_textstyle(self, textstyle):
+        """Register the style of a PAGE TextStyle object and return its ID."""
+        kwargs = {}
+        kwargs['font_family'] = textstyle.fontFamily
+        kwargs['font_type'] = 'serif' if textstyle.serif else 'sans-serif'
+        kwargs['font_width'] = 'fixed' if textstyle.monospace else 'proportional'
+        if textstyle.fontSize:
+            kwargs['font_size'] = textstyle.fontSize
+        # 0 is a valid colour (black), so compare against None
+        if textstyle.textColourRgb is not None:
+            # PAGE encodes RGB as red + 256 * green + 65536 * blue
+            b, rest = divmod(textstyle.textColourRgb, 65536)
+            g, r = divmod(rest, 256)
+            kwargs['font_color'] = '%02x%02x%02x' % (r, g, b)
+        # A known colour name takes precedence over the RGB value
+        rgb = PAGE_COLOUR_TO_RGB.get(textstyle.textColour)
+        if rgb:
+            kwargs['font_color'] = rgb
+        font_style = []
+        if textstyle.italic:
+            font_style.append('italics')
+        if textstyle.underlined:
+            font_style.append('underline')
+        for att in ('bold', 'smallCaps', 'strikethrough', 'subscript', 'superscript'):
+            if getattr(textstyle, att):
+                font_style.append(att.lower())
+        if font_style:
+            kwargs['font_style'] = ' '.join(font_style)
+        # TODO kerning
+        # TODO underlineStyle
+        # TODO bgColour
+        # TODO bgColourRgb
+        # TODO reverseVideo
+        # TODO xHeight
+        # TODO letterSpaced
+        return self.get_style_id(**kwargs)
+
+    def to_xml(self, alto_styles):
+        """Append one element per registered style to ``alto_styles``."""
+        for style_id, style in self.styles.items():
+            el_style = ET.SubElement(alto_styles, self.output_element)
+            el_style.set('ID', style_id)
+            for k, v in style.items():
+                # Unset fields are stored as the string 'None'
+                if v != 'None':
+                    el_style.set(k.replace('_', '').upper(), v)
+
+class ParagraphStyleManager(TextStylesManager):
+    """Collect distinct paragraph styles and serialize them as ALTO ParagraphStyle elements."""
+
+    def __init__(self):
+        super().__init__()
+        self.fields = ('align', 'left', 'right', 'line_space', 'first_line')
+        self.output_element = 'ParagraphStyle'
diff --git a/ocrd_page_to_alto/utils.py b/ocrd_page_to_alto/utils.py
index aef60c6..f6c7d8c 100644
--- a/ocrd_page_to_alto/utils.py
+++ b/ocrd_page_to_alto/utils.py
@@ -1,6 +1,9 @@
 from lxml import etree as ET
 from ocrd_utils import xywh_from_points
 
+def setxml(el, name, val):
+    el.set(name, str(val))
+
 def set_alto_xywh_from_coords(reg_alto, reg_page, classes=None):
     if classes is None:
         classes = ['HEIGHT', 'WIDTH', 'HPOS', 'VPOS']
@@ -21,7 +24,3 @@ def set_alto_shape_from_coords(reg_alto, reg_page):
 
 def set_alto_id_from_page_id(reg_alto, reg_page):
     setxml(reg_alto, 'ID', reg_page.id)
-
-def setxml(el, name, val):
-    el.set(name, str(val))
-
diff --git a/tests/test_styles.py b/tests/test_styles.py
new file mode 100644
--- /dev/null
+++ b/tests/test_styles.py
@@ -0,0 +1,34 @@
+from pytest import raises, main, fixture
+from lxml import etree as ET
+from ocrd_models.ocrd_page import TextStyleType, to_xml
+
+from ocrd_page_to_alto.styles import TextStylesManager
+
+def test_styles_id():
+    m = TextStylesManager()
+    assert m.get_style_id(font_family='Foo') == 'Foo---None---None---None---None---None'
+    assert m.styles['Foo---None---None---None---None---None']['font_family'] == 'Foo'
+
+def test_styles_to_xml():
+    m = TextStylesManager()
+    m.get_style_id(font_family='Foo Serif')
+    el = ET.Element('Styles')
+    m.to_xml(el)
+    assert ET.tostring(el).decode('utf-8') == '<Styles><TextStyle ID="Foo%20Serif---None---None---None---None---None" FONTFAMILY="Foo Serif"/></Styles>'
+    assert m.styles['Foo%20Serif---None---None---None---None---None']['font_family'] == 'Foo Serif'
+
+def test_styles_from_textstyle():
+    m = TextStylesManager()
+    textstyle = TextStyleType(fontFamily='Times New Roman', serif=True, textColourRgb=6559300)
+    # 6559300 == 68 + 256 * 22 + 65536 * 100, i.e. r=0x44 g=0x16 b=0x64
+    style_id = m.from_textstyle(textstyle)
+    assert style_id == 'Times%20New%20Roman---serif---proportional---None---441664---None'
+    assert m.styles[style_id]['font_color'] == '441664'
+
+def test_styles_named_colour():
+    m = TextStylesManager()
+    style_id = m.from_textstyle(TextStyleType(textColour='cyan'))
+    assert m.styles[style_id]['font_color'] == '00ffff'
+
+if __name__ == "__main__":
+    main([__file__])