-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add core ShapelyFormatter and WKFormatter, which outputs WKT which may or may not be valid. So #2 is far along now.
- Loading branch information
Showing
4 changed files
with
58 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from hocrgeo.formatters.core import ShapelyFormatter | ||
from hocrgeo.formatters.wk import WKFormatter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from shapely.geometry import Polygon, box | ||
|
||
from shapely import speedups | ||
|
||
# Switch on shapely's optional speedups at import time, but only when the
# installed shapely reports them as available.
if speedups.available:
    speedups.enable()
|
||
class ShapelyFormatter:
    '''Formats a HOCRDocument into shapely geometry.

    After :meth:`parse_hocr` has run, ``self._polygons`` holds one rectangle
    for every feature that carried a ``bbox`` entry, in depth-first order
    (a feature first, then everything nested below it).
    '''

    def __init__(self, hocr_document=None):
        '''
        :param hocr_document: optional document to parse straight away; see
            :meth:`parse_hocr`.
        '''
        # Always define the attribute so it can be read safely even when no
        # document has been parsed yet.
        self._polygons = []
        if hocr_document:
            self.parse_hocr(hocr_document)

    def parse_hocr(self, hocr_document):
        '''
        Parse a hocr_document created by hocrgeo.parsers.hocr.HOCRParser.

        Any polygons collected by an earlier call are discarded.

        :param hocr_document: instance of hocrparser.document. It is read
            through ``.get``: each level exposes its children under the keys
            'pages', 'careas', 'paragraphs', 'lines' and 'words', and a
            feature's rectangle under 'bbox' (with 'x0', 'y0', 'x1', 'y1').
        '''

        def _poly_from_object(obj):
            # A feature without a (non-empty) bbox has no geometry.
            bbox = obj.get('bbox', None)
            if not bbox:
                return None
            return box(bbox.get('x0'), bbox.get('y0'),
                       bbox.get('x1'), bbox.get('y1'))

        def _extract_polys_from_feature_tree(polygons, root, feature_keys):
            child_keys = feature_keys[1:]
            # A level may be missing (or None) on a given feature; treat that
            # as "no children" instead of failing on iteration.
            for feature in root.get(feature_keys[0]) or ():
                poly = _poly_from_object(feature)
                # Only keep real geometries so consumers never meet None.
                if poly is not None:
                    polygons.append(poly)
                # Children are visited even when the parent had no bbox.
                if child_keys:
                    _extract_polys_from_feature_tree(
                        polygons, feature, child_keys)

        self._polygons = []

        # Nesting order of the feature levels, outermost first.
        features = ('pages', 'careas', 'paragraphs', 'lines', 'words')

        _extract_polys_from_feature_tree(
            self._polygons, hocr_document, features)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
from shapely.geometry import Polygon, box | ||
|
||
from hocrgeo.formatters.core import ShapelyFormatter | ||
|
||
class WKFormatter(ShapelyFormatter):
    """Formats a HOCRParser document as well-known text (WKT)."""

    @property
    def wkt(self):
        """Return the collected polygons as WKT, one geometry per line.

        :returns: the newline-joined WKT strings, or ``None`` when there is
            no geometry to serialise.
        """
        # Tolerate a formatter on which no document has been parsed (the
        # attribute may be unset) and placeholder ``None`` entries standing
        # in for features that had no geometry.
        polygons = getattr(self, '_polygons', None) or ()
        lines = [p.wkt for p in polygons if p is not None]
        if not lines:
            return None
        return u'\n'.join(lines)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
chardet | ||
lxml | ||
beautifulsoup4 | ||
geojson | ||
Shapely |