Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Repair fix coords #43

Merged
merged 12 commits into from
Sep 14, 2020
109 changes: 35 additions & 74 deletions ocrd_segment/repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from scipy.ndimage import filters, morphology
import cv2
import numpy as np
from shapely.geometry import Polygon, LineString
from shapely.geometry import asPolygon, Polygon, LineString

from ocrd import Processor
from ocrd_utils import (
Expand All @@ -23,8 +23,6 @@
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
CoordsType,
LabelType, LabelsType,
MetadataItemType,
to_xml
)
from ocrd_models.ocrd_page_generateds import (
Expand All @@ -36,6 +34,7 @@
UnorderedGroupIndexedType,
ReadingOrderType
)
from ocrd_validators.page_validator import PageValidator
from .config import OCRD_TOOL

TOOL = 'ocrd-segment-repair'
Expand Down Expand Up @@ -68,23 +67,17 @@ def process(self):
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
self.add_metadata(pcgts)
page = pcgts.get_Page()
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))


#
# validate segmentation (warn of children extending beyond their parents)
#
self.validate_coords(page, page_id)
report = PageValidator.validate(ocrd_page=pcgts,
page_textequiv_consistency='off',
check_baseline=False)
if not report.is_valid:
LOG.warning(report.to_xml())

#
# sanitize region segmentation (shrink to hull of lines)
Expand Down Expand Up @@ -240,11 +233,13 @@ def sanitize_page(self, page, page_id):
LOG.warning('Ignoring contour %d too small (%d/%d) in region "%s"',
i, area, total_area, region.id)
continue
# simplify shape:
# simplify shape (until valid):
# can produce invalid (self-intersecting) polygons:
#polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
polygon = contour[:, 0, ::] # already ordered x,y
polygon = Polygon(polygon).simplify(1).exterior.coords
polygon = Polygon(polygon).simplify(1)
polygon = make_valid(polygon)
polygon = polygon.exterior.coords[:-1] # keep open
if len(polygon) < 4:
LOG.warning('Ignoring contour %d less than 4 points in region "%s"',
i, region.id)
Expand All @@ -259,61 +254,6 @@ def sanitize_page(self, page, page_id):
LOG.info('Using new coordinates for region "%s"', region.id)
region.get_Coords().points = points_from_polygon(region_polygon)

def validate_coords(self, page, page_id):
    """Warn about every segment whose outline leaves its parent's outline.

    Regions of all types are compared against the page Border (only if
    the page has one). Within text regions, lines are compared against
    their region, baselines and words against their line, and glyphs
    against their word. Each violation is logged as a warning.

    Args:
        page: page object providing ``get_Border()`` and the
            ``get_*Region()`` accessors.
        page_id: page identifier, used in the log messages only.

    Returns:
        True if no violation was found, False otherwise.
    """
    consistent = True
    text_regions = page.get_TextRegion()
    border = page.get_Border()
    if border:
        # collect all non-text region types, in the same order as they are queried
        non_text_regions = page.get_AdvertRegion()
        for getter in (page.get_ChartRegion,
                       page.get_ChemRegion,
                       page.get_GraphicRegion,
                       page.get_ImageRegion,
                       page.get_LineDrawingRegion,
                       page.get_MathsRegion,
                       page.get_MusicRegion,
                       page.get_NoiseRegion,
                       page.get_SeparatorRegion,
                       page.get_TableRegion,
                       page.get_UnknownRegion):
            non_text_regions = non_text_regions + getter()
        for segment in text_regions + non_text_regions:
            if _child_within_parent(segment, border):
                continue
            LOG.warning('Region "%s" extends beyond Border of page "%s"',
                        segment.id, page_id)
            consistent = False
    # only text regions have a line/word/glyph hierarchy to descend into
    for text_region in text_regions:
        for text_line in text_region.get_TextLine():
            if not _child_within_parent(text_line, text_region):
                LOG.warning('Line "%s" extends beyond region "%s" on page "%s"',
                            text_line.id, text_region.id, page_id)
                consistent = False
            baseline_element = text_line.get_Baseline()
            if baseline_element:
                # the baseline is an open path, so compare it as a line string
                baseline_path = LineString(
                    polygon_from_points(baseline_element.points))
                line_outline = Polygon(
                    polygon_from_points(text_line.get_Coords().points))
                if not baseline_path.within(line_outline):
                    LOG.warning('Baseline extends beyond line "%s" in region "%s" on page "%s"',
                                text_line.id, text_region.id, page_id)
                    consistent = False
            for token in text_line.get_Word():
                if not _child_within_parent(token, text_line):
                    LOG.warning('Word "%s" extends beyond line "%s" in region "%s" on page "%s"',
                                token.id, text_line.id, text_region.id, page_id)
                    consistent = False
                for char in token.get_Glyph():
                    if not _child_within_parent(char, token):
                        LOG.warning('Glyph "%s" extends beyond word "%s" in line "%s" of region "%s" on page "%s"',
                                    char.id, token.id, text_line.id, text_region.id, page_id)
                        consistent = False
    return consistent

def _child_within_parent(child, parent):
    """Tell whether the outline of ``child`` lies within the outline of ``parent``.

    Both arguments must provide ``get_Coords().points``; each points string
    is converted into a ``Polygon`` before the containment test.
    """
    # child first, parent second -- same evaluation order as the unpacking below
    inner, outer = [Polygon(polygon_from_points(segment.get_Coords().points))
                    for segment in (child, parent)]
    return inner.within(outer)

def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_merging):
wait_for_deletion = list()
reading_order = dict()
Expand Down Expand Up @@ -354,7 +294,13 @@ def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_mergi
# and use-cases in the future
superpoly = Polygon(polygon_from_points(superreg.get_Coords().points))
superpoly = superpoly.union(poly)
superreg.get_Coords().points = points_from_polygon(superpoly.exterior.coords)
if superpoly.type == 'MultiPolygon':
superpoly = superpoly.convex_hull
if superpoly.minimum_clearance < 1.0:
superpoly = asPolygon(np.round(superpoly.exterior.coords))
superpoly = make_valid(superpoly)
superpoly = superpoly.exterior.coords[:-1] # keep open
superreg.get_Coords().points = points_from_polygon(superpoly)
# FIXME should we merge/mix attributes and features?
if region.get_orientation() != superreg.get_orientation():
LOG.warning('Merging region "%s" with orientation %f into "%s" with %f',
Expand Down Expand Up @@ -399,3 +345,18 @@ def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_mergi
if region.parent_object_:
# remove in-place
region.parent_object_.get_TextRegion().remove(region)

def make_valid(polygon):
    """Ensures shapely.geometry.Polygon object is valid by repeated rearrangement and simplification.

    First, as long as the polygon is neither valid nor can be made valid by
    a single coarse simplification, the exterior ring is rebuilt from another
    starting point. Afterwards, the polygon is simplified with growing
    tolerance until it is valid.

    Args:
        polygon: the polygon to repair (only its exterior ring is used).

    Returns:
        The input itself if it is already valid, otherwise the rearranged
        and/or simplified polygon. The result can still be invalid if no
        attempted rotation or tolerance succeeds.
    """
    # Take the ring once, without its closing point (the Polygon constructor
    # closes rings itself): every rotation below then starts from the original
    # point order, instead of compounding earlier rotations and dragging a
    # duplicate of the closing vertex into the middle of the ring.
    points = list(polygon.exterior.coords)[:-1]
    for split in range(1, len(points)):
        if polygon.is_valid or polygon.simplify(polygon.area).is_valid:
            break
        # simplification may not be possible (at all) due to ordering
        # in that case, try another starting point
        polygon = Polygon(points[-split:] + points[:-split])
    # Always try at least tolerance 1: the area of a self-intersecting outline
    # can be (close to) zero, which must not prevent any simplification attempt.
    for tolerance in range(int(polygon.area + 1.5)):
        if polygon.is_valid:
            break
        # simplification may require a larger tolerance
        polygon = polygon.simplify(tolerance + 1)
    return polygon
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ocrd >= 2.13.1
bertsky marked this conversation as resolved.
Show resolved Hide resolved
shapely
shapely >= 1.7.1
scikit-image
numpy