Skip to content

Commit

Permalink
Add core ShapelyFormatter and WKFormatter, which outputs WKT which ma…
Browse files Browse the repository at this point in the history
…y or may not be valid. So #2 is far along now.
  • Loading branch information
dcloud committed Feb 24, 2014
1 parent 69d2c5e commit 64c3f58
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 1 deletion.
2 changes: 2 additions & 0 deletions hocrgeo/formatters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from hocrgeo.formatters.core import ShapelyFormatter
from hocrgeo.formatters.wk import WKFormatter
42 changes: 42 additions & 0 deletions hocrgeo/formatters/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from shapely.geometry import Polygon, box

from shapely import speedups

# Enable Shapely's optional speedups, but only when this installation
# reports them as available.
if speedups.available:
    speedups.enable()

class ShapelyFormatter(object):
    '''Formats a HOCRDocument into shapely geometry.

    After parsing, ``self._polygons`` holds one shapely box for every feature
    that carries a bounding box, ordered parent-first (a page, then its
    careas, and so on down to words).
    '''

    def __init__(self, hocr_document=None):
        '''
        :param hocr_document: optional parsed hOCR document; it is parsed
            immediately when truthy.
        '''
        # Define the attribute up front so code reading ``_polygons`` (for
        # example a subclass property) works even without a document.
        self._polygons = []
        if hocr_document:
            self.parse_hocr(hocr_document)

    def parse_hocr(self, hocr_document):
        '''
        Parse a hocr_document created by hocrgeo.parsers.hocr.HOCRParser.

        Replaces any previously collected polygons with the ones found in
        ``hocr_document``.

        :param hocr_document: instance of hocrparser.document. It is read
            through ``.get()``: each level exposes its children under the
            keys 'pages', 'careas', 'paragraphs', 'lines' and 'words', and
            each feature may carry a 'bbox' mapping with 'x0', 'y0', 'x1'
            and 'y1'.
        '''

        def _poly_from_object(obj):
            # Build a rectangle from the feature's bbox; None when the
            # feature has no (or an empty) bbox.
            bbox = obj.get('bbox', None)
            if not bbox:
                return None
            return box(bbox.get('x0'), bbox.get('y0'),
                       bbox.get('x1'), bbox.get('y1'))

        def _extract_polys_from_feature_tree(polygons, root, feature_keys):
            # A level may be absent from the document; treat it as empty
            # rather than failing on iteration over None.
            features = root.get(feature_keys[0]) or ()
            # Loop-invariant: the remaining, deeper levels of the hierarchy.
            child_keys = feature_keys[1:]
            for f in features:
                poly = _poly_from_object(f)
                # Features without a bbox contribute no geometry; do not
                # store None placeholders that would break consumers.
                if poly is not None:
                    polygons.append(poly)
                if child_keys:
                    _extract_polys_from_feature_tree(polygons, f, child_keys)

        self._polygons = []

        # Nesting order of the feature collections, outermost first.
        features = ('pages', 'careas', 'paragraphs', 'lines', 'words')

        _extract_polys_from_feature_tree(self._polygons, hocr_document, features)
13 changes: 13 additions & 0 deletions hocrgeo/formatters/wk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from shapely.geometry import Polygon, box

from hocrgeo.formatters.core import ShapelyFormatter

class WKFormatter(ShapelyFormatter):
    """Formats a HOCRParser document as well-known text"""

    @property
    def wkt(self):
        """Return the collected polygons as WKT, one geometry per line.

        :returns: a newline-joined unicode string of each polygon's ``wkt``,
            or None when there are no polygons to output.
        """
        # Tolerate the attribute being absent (nothing parsed yet) and skip
        # None placeholders so one bbox-less feature cannot break the output.
        polygons = getattr(self, '_polygons', None) or []
        lines = [p.wkt for p in polygons if p is not None]
        if lines:
            return u'\n'.join(lines)
        return None

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
chardet
lxml
beautifulsoup4
geojson
Shapely

0 comments on commit 64c3f58

Please sign in to comment.