From 64c3f588cbea3bbfd5a23612a4337c773b6a1a38 Mon Sep 17 00:00:00 2001
From: Daniel Cloud
Date: Mon, 24 Feb 2014 11:33:52 -0500
Subject: [PATCH] Add core ShapelyFormatter and WKFormatter, which outputs WKT
 which may or may not be valid. So #2 is far along now.

---
 hocrgeo/formatters/__init__.py |  2 ++
 hocrgeo/formatters/core.py     | 57 ++++++++++++++++++++++++++++++++++
 hocrgeo/formatters/wk.py       | 18 +++++++++++
 requirements.txt               |  2 +-
 4 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 hocrgeo/formatters/core.py
 create mode 100644 hocrgeo/formatters/wk.py

diff --git a/hocrgeo/formatters/__init__.py b/hocrgeo/formatters/__init__.py
index e69de29..eb81445 100644
--- a/hocrgeo/formatters/__init__.py
+++ b/hocrgeo/formatters/__init__.py
@@ -0,0 +1,2 @@
+from hocrgeo.formatters.core import ShapelyFormatter
+from hocrgeo.formatters.wk import WKFormatter
diff --git a/hocrgeo/formatters/core.py b/hocrgeo/formatters/core.py
new file mode 100644
index 0000000..3c5499a
--- /dev/null
+++ b/hocrgeo/formatters/core.py
@@ -0,0 +1,57 @@
+from shapely.geometry import Polygon, box
+
+from shapely import speedups
+
+if speedups.available:
+    speedups.enable()
+
+class ShapelyFormatter(object):
+    '''Formats a HOCRDocument into shapely geometry'''
+    def __init__(self, hocr_document=None):
+        '''
+        Optionally parse hocr_document right away
+
+        :param hocr_document: document handed to parse_hocr; skipped when falsy.
+        '''
+        # Start empty so self._polygons exists even before anything is parsed
+        self._polygons = []
+        if hocr_document:
+            self.parse_hocr(hocr_document)
+
+    def parse_hocr(self, hocr_document):
+        '''
+        Parse a hocr_document created by hocrgeo.parsers.hocr.HOCRParser
+
+        Rebuilds self._polygons with one shapely box for every feature found
+        under the nested 'pages' > 'careas' > 'paragraphs' > 'lines' > 'words'
+        keys that carries a truthy 'bbox'.
+
+        :param hocr_document: instance of hocrparser.document. It is read
+            through .get(), so it has to be dict-like.
+        '''
+
+        def _poly_from_object(obj):
+            '''Return a shapely box for obj's bbox, or None if it has none'''
+            bbox = obj.get('bbox', None)
+            if bbox:
+                return box(bbox.get('x0'), bbox.get('y0'), bbox.get('x1'), bbox.get('y1'))
+            return None
+
+        def _extract_polys_from_feature_tree(polygons, root, feature_keys):
+            '''Append polygons for root's features and their descendants'''
+            child_keys = feature_keys[1:]
+            # A level that is absent from root (get() returns None) simply
+            # contributes no features instead of raising on iteration.
+            for f in root.get(feature_keys[0]) or ():
+                poly = _poly_from_object(f)
+                # Features without a bbox produce no geometry; skipping them
+                # keeps None out of the list that subclasses serialize.
+                if poly is not None:
+                    polygons.append(poly)
+
+                if child_keys:
+                    _extract_polys_from_feature_tree(polygons, f, child_keys)
+
+        self._polygons = []
+
+        features = ('pages', 'careas', 'paragraphs', 'lines', 'words')
+
+        _extract_polys_from_feature_tree(self._polygons, hocr_document, features)
diff --git a/hocrgeo/formatters/wk.py b/hocrgeo/formatters/wk.py
new file mode 100644
index 0000000..60f9661
--- /dev/null
+++ b/hocrgeo/formatters/wk.py
@@ -0,0 +1,18 @@
+from shapely.geometry import Polygon, box
+
+from hocrgeo.formatters.core import ShapelyFormatter
+
+class WKFormatter(ShapelyFormatter):
+    """Formats a HOCRParser document as well-known text"""
+
+    @property
+    def wkt(self):
+        """
+        WKT of every parsed polygon, one geometry per line
+
+        :returns: newline-joined WKT string, or None when there are no polygons.
+        """
+        if self._polygons:
+            return u'\n'.join([p.wkt for p in self._polygons])
+        return None
+
diff --git a/requirements.txt b/requirements.txt
index c6f0a25..bfe5495 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 chardet
 lxml
 beautifulsoup4
-geojson
\ No newline at end of file
+Shapely
\ No newline at end of file