From 57fd1e7c6404220ea62e6f9d31b435743deabbd0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 01:27:20 +0100 Subject: [PATCH 01/23] tiseg: fix typo --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index 442b0d8..334ba07 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -170,7 +170,7 @@ def _process_segment(self,page_image, page, page_xywh, page_id, input_file, n, m image_part = array((1-I*Iseedfill), dtype=int) text_part = array((1-I*(1-Iseedfill)), dtype=int) - bin_array = array(255*(text_part>ocrolib.midrange(img_part)),'B') + bin_array = array(255*(text_part>ocrolib.midrange(image_part)),'B') text_part = ocrolib.array2pil(bin_array) bin_array = array(255*(text_part>ocrolib.midrange(text_part)),'B') From bab56a6f73fc9d1d4462511a4ccfc718c117273f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 01:30:31 +0100 Subject: [PATCH 02/23] tiseg: remove trailing whitespace --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 70 ++++++++++---------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index 334ba07..eb2b0b8 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -25,14 +25,14 @@ from ocrd import Processor from ocrd_modelfactory import page_from_file from ocrd_utils import ( - getLogger, - concat_padded, + getLogger, + concat_padded, MIMETYPE_PAGE, coordinates_for_segment, points_from_polygon, make_file_id, assert_file_grp_cardinality, - ) +) import click from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor @@ -61,14 +61,14 @@ def process(self): assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) oplevel = self.parameter['operation_level'] - + model = None if self.parameter['use_deeplr']: - + model_weights = self.resolve_resource(self.parameter['seg_weights']) - + if not Path(model_weights).is_file(): - LOG.error("""\ + LOG.error(""" Segementation model weights file was not found at '%s'. Make sure the `seg_weights` parameter points to the local model weights path. """ % model_weights) @@ -78,27 +78,26 @@ def process(self): #model.load_weights(model_weights) model = load_model(model_weights) LOG.info('Segmentation Model loaded') - + for (n, input_file) in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID - + pcgts = page_from_file(self.workspace.download_file(input_file)) self.add_metadata(pcgts) page = pcgts.get_Page() LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID) - + if self.parameter['use_deeplr']: page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized,deskewed,cropped') else: - page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_selector='binarized,deskewed,cropped') - + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_selector='binarized,deskewed,cropped') + if oplevel == 'page': self._process_segment(page_image, page, page_xywh, page_id, input_file, n, model) else: LOG.warning('Operation level %s, but should be "page".', oplevel) break - file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) @@ -110,18 +109,18 @@ def process(self): local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), content=to_xml(pcgts).encode('utf-8'), ) - + def _process_segment(self,page_image, page, page_xywh, page_id, input_file, n, model): LOG = getLogger('OcrdAnybaseocrTiseg') - + if model: - + I = ocrolib.pil2array(page_image.resize((800, 1024), Image.ANTIALIAS)) I = np.array(I)[np.newaxis, :, :, :] LOG.info('I shape %s', I.shape) if len(I.shape)<3: print('Wrong input shape. Image should have 3 channel') - + # get prediction #out = model.predict_segmentation( # inp=I, @@ -132,22 +131,22 @@ def _process_segment(self,page_image, page, page_xywh, page_id, input_file, n, m text_part = np.ones(out.shape) text_part[np.where(out==1)] = 0 - + image_part = np.ones(out.shape) image_part[np.where(out==2)] = 0 - + image_part = array(255*(image_part), 'B') image_part = ocrolib.array2pil(image_part) text_part = array(255*(text_part), 'B') text_part = ocrolib.array2pil(text_part) - + text_part = text_part.resize(page_image.size, Image.BICUBIC) image_part = image_part.resize(page_image.size, Image.BICUBIC) - + else: I = ocrolib.pil2array(page_image) - + if len(I.shape) > 2: I = np.mean(I, 2) I = 1-I/I.max() @@ -168,31 +167,31 @@ def _process_segment(self,page_image, page, page_xywh, page_id, input_file, n, m # Write Text and Non-Text images image_part = array((1-I*Iseedfill), dtype=int) - text_part = array((1-I*(1-Iseedfill)), dtype=int) + text_part = array((1-I*(1-Iseedfill)), dtype=int) bin_array = array(255*(text_part>ocrolib.midrange(image_part)),'B') - text_part = ocrolib.array2pil(bin_array) - + text_part = ocrolib.array2pil(bin_array) + bin_array = array(255*(text_part>ocrolib.midrange(text_part)),'B') - image_part = ocrolib.array2pil(bin_array) - - + image_part = ocrolib.array2pil(bin_array) + + file_id = make_file_id(input_file, self.output_file_grp) file_path = self.workspace.save_image_file(image_part, file_id+"_img", page_id=page_id, file_grp=self.output_file_grp, - ) + ) page.add_AlternativeImage(AlternativeImageType(filename=file_path, comments=page_xywh['features']+',non_text')) - + page_xywh['features'] += ',clipped' file_path = self.workspace.save_image_file(text_part, file_id+"_txt", page_id=page_id, file_grp=self.output_file_grp, - ) - page.add_AlternativeImage(AlternativeImageType(filename=file_path, comments=page_xywh['features'])) - + ) + page.add_AlternativeImage(AlternativeImageType(filename=file_path, comments=page_xywh['features'])) + def pixMorphSequence_mask_seed_fill_holes(self, I): Imask = self.reduction_T_1(I) Imask = self.reduction_T_1(Imask) @@ -254,7 +253,7 @@ def expansion(self, I, rows_cols): A[:, 2:4*c:4] = A[:, 0:4*c:4] A[:, 3:4*c:4] = A[:, 0:4*c:4] return A - + def alpha_shape(self, coords, alpha): import shapely.geometry as geometry from shapely.ops import cascaded_union, polygonize @@ -283,7 +282,7 @@ def add_edge(edges, edge_points, coords, i, j): return edges.add( (i, j) ) edge_points.append(coords[ [i, j] ]) - + tri = Delaunay(coords) edges = set() edge_points = [] @@ -313,7 +312,6 @@ def add_edge(edges, edge_points, coords, i, j): triangles = list(polygonize(m)) return cascaded_union(triangles), edge_points - @click.command() @ocrd_cli_options def cli(*args, **kwargs): From 624b32e53043cc1c3a2de8abb6c6e91b9e973329 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 01:40:05 +0100 Subject: [PATCH 03/23] tiseg: unused parameters --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 7 +------ ocrd_anybaseocr/ocrd-tool.json | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index eb2b0b8..647c1ad 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -60,7 +60,6 @@ def process(self): assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) - oplevel = self.parameter['operation_level'] model = None if self.parameter['use_deeplr']: @@ -93,11 +92,7 @@ def process(self): else: page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_selector='binarized,deskewed,cropped') - if oplevel == 'page': - self._process_segment(page_image, page, page_xywh, page_id, input_file, n, model) - else: - LOG.warning('Operation level %s, but should be "page".', oplevel) - break + self._process_segment(page_image, page, page_xywh, page_id, input_file, n, model) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) diff --git a/ocrd_anybaseocr/ocrd-tool.json b/ocrd_anybaseocr/ocrd-tool.json index 0622c22..73680ef 100755 --- a/ocrd_anybaseocr/ocrd-tool.json +++ b/ocrd_anybaseocr/ocrd-tool.json @@ -91,12 +91,19 @@ "steps": ["layout/segmentation/text-image"], "description": "Separates the text and non-text elements with anyBaseOCR. Outputs clipped versions of the input image as AlternativeImage containing either only text or non-text elements.", "parameters": { - "use_deeplr": {"type":"boolean", "default":true, "description": "use deep learning model"}, - "seg_weights": {"type":"string", "default":"seg_model.hdf5", "description":"path to weights file", "required":false}, - "classes": {"type":"integer", "default":3, "description":"number of classes" }, - "width" : {"type":"integer", "default":1024, "description":"input image height"}, - "height" : {"type":"integer", "default":800, "description":"input image width"}, - "operation_level": {"type": "string", "enum": ["page","region", "line"], "default": "page","description": "PAGE XML hierarchy level to operate on"} + "use_deeplr": { + "type":"boolean", + "default":true, + "description": "Whether to use deep learning model (UNet pixel classifier) instead of rule-based implementation (multi-resolution morphology)." + }, + "seg_weights": { + "type":"string", + "format":"uri", + "content-type": "application/x-hdf;subtype=bag", + "cacheable": true, + "default":"seg_model.hdf5", + "description":"Path to weights file for deep learning model when use_deeplr is true." + } } }, "ocrd-anybaseocr-textline": { From 99c457b4280a7d3f2825055dffaf1c0da6d3e7a9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 01:45:01 +0100 Subject: [PATCH 04/23] tiseg (legacy): do not enforce deskewed/cropped --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index 647c1ad..331527e 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -88,11 +88,12 @@ def process(self): LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID) if self.parameter['use_deeplr']: - page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized,deskewed,cropped') + page_image, page_coords, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized,deskewed,cropped') else: - page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_selector='binarized,deskewed,cropped') + # _should_ also be deskewed and cropped, but no need to enforce that here + page_image, page_coords, page_image_info = self.workspace.image_from_page(page, page_id, feature_selector='binarized') - self._process_segment(page_image, page, page_xywh, page_id, input_file, n, model) + self._process_segment(page, page_image, page_coords, page_id, input_file, model) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) @@ -105,7 +106,7 @@ def process(self): content=to_xml(pcgts).encode('utf-8'), ) - def _process_segment(self,page_image, page, page_xywh, page_id, input_file, n, model): + def _process_segment(self, page, page_image, page_coords, page_id, input_file, model): LOG = getLogger('OcrdAnybaseocrTiseg') if model: @@ -177,15 +178,16 @@ def _process_segment(self,page_image, page, page_xywh, page_id, input_file, n, m page_id=page_id, file_grp=self.output_file_grp, ) - page.add_AlternativeImage(AlternativeImageType(filename=file_path, comments=page_xywh['features']+',non_text')) + page.add_AlternativeImage(AlternativeImageType( + filename=file_path, comments=page_coords['features'] + ',non_text')) - page_xywh['features'] += ',clipped' file_path = self.workspace.save_image_file(text_part, file_id+"_txt", page_id=page_id, file_grp=self.output_file_grp, ) - page.add_AlternativeImage(AlternativeImageType(filename=file_path, comments=page_xywh['features'])) + page.add_AlternativeImage(AlternativeImageType( + filename=file_path, comments=page_coords['features'] + ',clipped')) def pixMorphSequence_mask_seed_fill_holes(self, I): Imask = self.reduction_T_1(I) From 62a97656a7e80c89a520bfc5207ae9b6358d49dd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 01:45:55 +0100 Subject: [PATCH 05/23] tiseg (legacy): fix image pageId --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index 331527e..a3c021a 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -175,7 +175,7 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file, m file_id = make_file_id(input_file, self.output_file_grp) file_path = self.workspace.save_image_file(image_part, file_id+"_img", - page_id=page_id, + page_id=input_file.pageId, file_grp=self.output_file_grp, ) page.add_AlternativeImage(AlternativeImageType( @@ -183,7 +183,7 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file, m file_path = self.workspace.save_image_file(text_part, file_id+"_txt", - page_id=page_id, + page_id=input_file.pageId, file_grp=self.output_file_grp, ) page.add_AlternativeImage(AlternativeImageType( From 21a2cd9e4facafc80e0b274a2d811040d4d1362f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 01:54:06 +0100 Subject: [PATCH 06/23] tiseg: clean imports and import order --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 78 ++++++++++---------- 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index a3c021a..3bc516a 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -8,22 +8,23 @@ # URL - https://www.dfki.de/fileadmin/user_upload/import/9512_ICDAR2017_anyOCR.pdf -from scipy import ones, zeros, array, where, shape, ndimage, logical_or, logical_and import copy -from pylab import unique -import ocrolib import json -from PIL import Image -import sys import os +from pathlib import Path +import sys +import math +import click +from PIL import Image +from scipy import ndimage import numpy as np import shapely -import cv2 -import math -from ..constants import OCRD_TOOL -from pathlib import Path +import ocrolib +from keras.models import load_model +#from keras_segmentation.models.unet import resnet50_unet from ocrd import Processor from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import to_xml, AlternativeImageType from ocrd_utils import ( getLogger, concat_padded, @@ -33,13 +34,8 @@ make_file_id, assert_file_grp_cardinality, ) -import click from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor - -from keras.models import load_model -#from keras_segmentation.models.unet import resnet50_unet - -from ocrd_models.ocrd_page import to_xml, AlternativeImageType +from ..constants import OCRD_TOOL TOOL = 'ocrd-anybaseocr-tiseg' @@ -131,10 +127,10 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file, m image_part = np.ones(out.shape) image_part[np.where(out==2)] = 0 - image_part = array(255*(image_part), 'B') + image_part = np.array(255*(image_part), 'B') image_part = ocrolib.array2pil(image_part) - text_part = array(255*(text_part), 'B') + text_part = np.array(255*(text_part), 'B') text_part = ocrolib.array2pil(text_part) text_part = text_part.resize(page_image.size, Image.BICUBIC) @@ -155,20 +151,20 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file, m Iseedfill = self.pixSeedfillBinary(Imask, Iseed) # Dilation of Iseedfill - mask = ones((3, 3)) + mask = np.ones((3, 3)) Iseedfill = ndimage.binary_dilation(Iseedfill, mask) # Expansion of Iseedfill to become equal in size of I Iseedfill = self.expansion(Iseedfill, (rows, cols)) # Write Text and Non-Text images - image_part = array((1-I*Iseedfill), dtype=int) - text_part = array((1-I*(1-Iseedfill)), dtype=int) + image_part = np.array((1-I*Iseedfill), dtype=int) + text_part = np.array((1-I*(1-Iseedfill)), dtype=int) - bin_array = array(255*(text_part>ocrolib.midrange(image_part)),'B') + bin_array = np.array(255*(text_part>ocrolib.midrange(image_part)),'B') text_part = ocrolib.array2pil(bin_array) - bin_array = array(255*(text_part>ocrolib.midrange(text_part)),'B') + bin_array = np.array(255*(text_part>ocrolib.midrange(text_part)),'B') image_part = ocrolib.array2pil(bin_array) @@ -195,53 +191,53 @@ def pixMorphSequence_mask_seed_fill_holes(self, I): Imask = ndimage.binary_fill_holes(Imask) Iseed = self.reduction_T_4(Imask) Iseed = self.reduction_T_3(Iseed) - mask = array(ones((5, 5)), dtype=int) + mask = np.array(np.ones((5, 5)), dtype=int) Iseed = ndimage.binary_opening(Iseed, mask) Iseed = self.expansion(Iseed, Imask.shape) return Imask, Iseed def pixSeedfillBinary(self, Imask, Iseed): Iseedfill = copy.deepcopy(Iseed) - s = ones((3, 3)) + s = np.ones((3, 3)) Ijmask, k = ndimage.label(Imask, s) Ijmask2 = Ijmask * Iseedfill - A = list(unique(Ijmask2)) + A = list(np.unique(Ijmask2)) A.remove(0) for i in range(0, len(A)): - x, y = where(Ijmask == A[i]) + x, y = np.where(Ijmask == A[i]) Iseedfill[x, y] = 1 return Iseedfill def reduction_T_1(self, I): - A = logical_or(I[0:-1:2, :], I[1::2, :]) - A = logical_or(A[:, 0:-1:2], A[:, 1::2]) + A = np.logical_or(I[0:-1:2, :], I[1::2, :]) + A = np.logical_or(A[:, 0:-1:2], A[:, 1::2]) return A def reduction_T_2(self, I): - A = logical_or(I[0:-1:2, :], I[1::2, :]) - A = logical_and(A[:, 0:-1:2], A[:, 1::2]) - B = logical_and(I[0:-1:2, :], I[1::2, :]) - B = logical_or(B[:, 0:-1:2], B[:, 1::2]) - C = logical_or(A, B) + A = np.logical_or(I[0:-1:2, :], I[1::2, :]) + A = np.logical_and(A[:, 0:-1:2], A[:, 1::2]) + B = np.logical_and(I[0:-1:2, :], I[1::2, :]) + B = np.logical_or(B[:, 0:-1:2], B[:, 1::2]) + C = np.logical_or(A, B) return C def reduction_T_3(self, I): - A = logical_or(I[0:-1:2, :], I[1::2, :]) - A = logical_and(A[:, 0:-1:2], A[:, 1::2]) - B = logical_and(I[0:-1:2, :], I[1::2, :]) - B = logical_or(B[:, 0:-1:2], B[:, 1::2]) - C = logical_and(A, B) + A = np.logical_or(I[0:-1:2, :], I[1::2, :]) + A = np.logical_and(A[:, 0:-1:2], A[:, 1::2]) + B = np.logical_and(I[0:-1:2, :], I[1::2, :]) + B = np.logical_or(B[:, 0:-1:2], B[:, 1::2]) + C = np.logical_and(A, B) return C def reduction_T_4(self, I): - A = logical_and(I[0:-1:2, :], I[1::2, :]) - A = logical_and(A[:, 0:-1:2], A[:, 1::2]) + A = np.logical_and(I[0:-1:2, :], I[1::2, :]) + A = np.logical_and(A[:, 0:-1:2], A[:, 1::2]) return A def expansion(self, I, rows_cols): r, c = I.shape rows, cols = rows_cols - A = zeros((rows, cols)) + A = np.zeros((rows, cols)) A[0:4*r:4, 0:4*c:4] = I A[1:4*r:4, :] = A[0:4*r:4, :] A[2:4*r:4, :] = A[0:4*r:4, :] From 82a0055c44dcd226967829f1762c8b2911c646d0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 01:59:33 +0100 Subject: [PATCH 07/23] tiseg (ML): load during init/setup instead of process --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 41 +++++++++----------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index 3bc516a..d27b45b 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -45,7 +45,21 @@ def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(OcrdAnybaseocrTiseg, self).__init__(*args, **kwargs) + if hasattr(self, 'output_file_grp') and hasattr(self, 'parameter'): + # processing context + self.setup() + def setup(self): + LOG = getLogger('OcrdAnybaseocrTiseg') + self.model = None + if self.parameter['use_deeplr']: + + model_weights = self.resolve_resource(self.parameter['seg_weights']) + #model = resnet50_unet(n_classes=self.parameter['classes'], input_height=self.parameter['height'], input_width=self.parameter['width']) + #model.load_weights(model_weights) + self.model = load_model(model_weights) + LOG.info('Loaded segmentation model') + def crop_image(self, image_path, crop_region): img = Image.open(image_path) cropped = img.crop(crop_region) @@ -57,23 +71,6 @@ def process(self): assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) - model = None - if self.parameter['use_deeplr']: - - model_weights = self.resolve_resource(self.parameter['seg_weights']) - - if not Path(model_weights).is_file(): - LOG.error(""" - Segementation model weights file was not found at '%s'. Make sure the `seg_weights` parameter - points to the local model weights path. - """ % model_weights) - sys.exit(1) - - #model = resnet50_unet(n_classes=self.parameter['classes'], input_height=self.parameter['height'], input_width=self.parameter['width']) - #model.load_weights(model_weights) - model = load_model(model_weights) - LOG.info('Segmentation Model loaded') - for (n, input_file) in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID @@ -89,7 +86,7 @@ def process(self): # _should_ also be deskewed and cropped, but no need to enforce that here page_image, page_coords, page_image_info = self.workspace.image_from_page(page, page_id, feature_selector='binarized') - self._process_segment(page, page_image, page_coords, page_id, input_file, model) + self._process_segment(page, page_image, page_coords, page_id, input_file) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) @@ -102,10 +99,10 @@ def process(self): content=to_xml(pcgts).encode('utf-8'), ) - def _process_segment(self, page, page_image, page_coords, page_id, input_file, model): + def _process_segment(self, page, page_image, page_coords, page_id, input_file): LOG = getLogger('OcrdAnybaseocrTiseg') - if model: + if self.model: I = ocrolib.pil2array(page_image.resize((800, 1024), Image.ANTIALIAS)) I = np.array(I)[np.newaxis, :, :, :] @@ -114,11 +111,11 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file, m print('Wrong input shape. Image should have 3 channel') # get prediction - #out = model.predict_segmentation( + #out = self.model.predict_segmentation( # inp=I, # out_fname="/tmp/out.png" #) - out = model.predict(I) + out = self.model.predict(I) out = out.reshape((2048, 1600, 3)).argmax(axis=2) text_part = np.ones(out.shape) From eb6c98fb0ace3b14c666616e385a60003f54fef0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 02:05:40 +0100 Subject: [PATCH 08/23] tiseg (ML): clean unused function --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index d27b45b..1575cb7 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -60,18 +60,13 @@ def setup(self): self.model = load_model(model_weights) LOG.info('Loaded segmentation model') - def crop_image(self, image_path, crop_region): - img = Image.open(image_path) - cropped = img.crop(crop_region) - return cropped - def process(self): LOG = getLogger('OcrdAnybaseocrTiseg') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) - for (n, input_file) in enumerate(self.input_files): + for input_file in self.input_files: page_id = input_file.pageId or input_file.ID pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -81,10 +76,12 @@ def process(self): LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID) if self.parameter['use_deeplr']: - page_image, page_coords, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized,deskewed,cropped') + kwargs = {'feature_filter': 'binarized,deskewed,cropped'} else: # _should_ also be deskewed and cropped, but no need to enforce that here - page_image, page_coords, page_image_info = self.workspace.image_from_page(page, page_id, feature_selector='binarized') + kwargs = {'feature_selector': 'binarized'} + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, **kwargs) self._process_segment(page, page_image, page_coords, page_id, input_file) From 96ec2eedcd2ffd5da30121639c692d506bacb175 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 02:17:45 +0100 Subject: [PATCH 09/23] tiseg (legacy): fix image vs text part --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index 1575cb7..0968719 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -158,10 +158,9 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file): bin_array = np.array(255*(text_part>ocrolib.midrange(image_part)),'B') text_part = ocrolib.array2pil(bin_array) - bin_array = np.array(255*(text_part>ocrolib.midrange(text_part)),'B') + bin_array = np.array(255*(image_part>ocrolib.midrange(text_part)),'B') image_part = ocrolib.array2pil(bin_array) - file_id = make_file_id(input_file, self.output_file_grp) file_path = self.workspace.save_image_file(image_part, file_id+"_img", From 9105973379879c0e55f70e1e2dc2776e65d00643 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 02:28:16 +0100 Subject: [PATCH 10/23] tiseg (legacy): fix image vs background --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index 0968719..14e05b4 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -152,17 +152,14 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file): Iseedfill = self.expansion(Iseedfill, (rows, cols)) # Write Text and Non-Text images - image_part = np.array((1-I*Iseedfill), dtype=int) - text_part = np.array((1-I*(1-Iseedfill)), dtype=int) + nontext_part = np.array(255*(1-I*Iseedfill), dtype='B') + text_part = np.array(255*(1-I*(1-Iseedfill)), dtype='B') - bin_array = np.array(255*(text_part>ocrolib.midrange(image_part)),'B') - text_part = ocrolib.array2pil(bin_array) - - bin_array = np.array(255*(image_part>ocrolib.midrange(text_part)),'B') - image_part = ocrolib.array2pil(bin_array) + nontext_image = ocrolib.array2pil(nontext_part) + text_image = ocrolib.array2pil(text_part) file_id = make_file_id(input_file, self.output_file_grp) - file_path = self.workspace.save_image_file(image_part, + file_path = self.workspace.save_image_file(nontext_image, file_id+"_img", page_id=input_file.pageId, file_grp=self.output_file_grp, @@ -170,7 +167,7 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file): page.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=page_coords['features'] + ',non_text')) - file_path = self.workspace.save_image_file(text_part, + file_path = self.workspace.save_image_file(text_image, file_id+"_txt", page_id=input_file.pageId, file_grp=self.output_file_grp, From 25fc8e135e4255380ec49f5ceb689f020e87185d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 1 Feb 2021 03:10:51 +0100 Subject: [PATCH 11/23] tiseg: show class counts --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 23 ++++++++++---------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index 14e05b4..4b2d044 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -115,20 +115,19 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file): out = self.model.predict(I) out = out.reshape((2048, 1600, 3)).argmax(axis=2) - text_part = np.ones(out.shape) + text_part = 255 * np.ones(out.shape, 'B') text_part[np.where(out==1)] = 0 + LOG.info('text: %d%', 100 * (1 - np.count_nonzero(text_part) / np.prod(out.shape))) - image_part = np.ones(out.shape) + image_part = 255 * np.ones(out.shape, 'B') image_part[np.where(out==2)] = 0 + LOG.info('image: %d%', 100 * (1 - np.count_nonzero(image_part) / np.prod(out.shape))) - image_part = np.array(255*(image_part), 'B') image_part = ocrolib.array2pil(image_part) - - text_part = np.array(255*(text_part), 'B') text_part = ocrolib.array2pil(text_part) - text_part = text_part.resize(page_image.size, Image.BICUBIC) image_part = image_part.resize(page_image.size, Image.BICUBIC) + text_part = text_part.resize(page_image.size, Image.BICUBIC) else: I = ocrolib.pil2array(page_image) @@ -152,14 +151,16 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file): Iseedfill = self.expansion(Iseedfill, (rows, cols)) # Write Text and Non-Text images - nontext_part = np.array(255*(1-I*Iseedfill), dtype='B') + image_part = np.array(255*(1-I*Iseedfill), dtype='B') text_part = np.array(255*(1-I*(1-Iseedfill)), dtype='B') + LOG.info('text: %d%', 100 * (1 - np.count_nonzero(text_part) / np.prod(I.shape))) + LOG.info('image: %d%', 100 * (1 - np.count_nonzero(image_part) / np.prod(I.shape))) - nontext_image = ocrolib.array2pil(nontext_part) - text_image = ocrolib.array2pil(text_part) + image_part = ocrolib.array2pil(image_part) + text_part = ocrolib.array2pil(text_part) file_id = make_file_id(input_file, self.output_file_grp) - file_path = self.workspace.save_image_file(nontext_image, + file_path = self.workspace.save_image_file(image_part, file_id+"_img", page_id=input_file.pageId, file_grp=self.output_file_grp, @@ -167,7 +168,7 @@ def _process_segment(self, page, page_image, page_coords, page_id, input_file): page.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=page_coords['features'] + ',non_text')) - file_path = self.workspace.save_image_file(text_image, + file_path = self.workspace.save_image_file(text_part, file_id+"_txt", page_id=input_file.pageId, file_grp=self.output_file_grp, From 665a8dd5ee6b4c1b8401627abb699c0a7ae580e6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Feb 2021 11:08:03 +0100 Subject: [PATCH 12/23] block-segmentation: resolve_resource already exits verbosely --- .../cli/ocrd_anybaseocr_block_segmentation.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py index 7a0696c..f10a7dc 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py @@ -89,17 +89,8 @@ def process(self): 'marginalia', 'footnote', 'footnote-continued', 'caption', 'endnote', 'footer', 'keynote', 'image', 'table', 'graphics'] - if not Path(model_weights).is_file(): - LOG.error("""\ - Block Segmentation model weights file was not found at '%s'. Make sure the `model_weights` parameter - points to the local model weights path. - """, model_weights) - sys.exit(1) - -# config = InferenceConfig(Config,DETECTION_MIN_CONFIDENCE) - config = InferenceConfig(confidence) -# config = InferenceConfig() + # TODO: allow selecting active class IDs mrcnn_model = model.MaskRCNN(mode="inference", model_dir=str(model_path), config=config) mrcnn_model.load_weights(str(model_weights), by_name=True) From 266756cfc98db4f27ff2b6827500cbaf7b157e46 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Feb 2021 12:01:51 +0100 Subject: [PATCH 13/23] block-segmentation: proper class ID/name mapping --- .../cli/ocrd_anybaseocr_block_segmentation.py | 91 ++++++++++--------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py index f10a7dc..d70e042 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py @@ -43,6 +43,26 @@ TOOL = 'ocrd-anybaseocr-block-segmentation' FALLBACK_IMAGE_GRP = 'OCR-D-IMG-BLOCK-SEGMENT' +CLASS_NAMES = ['BG', + 'page-number', + 'paragraph', + 'catch-word', + 'heading', + 'drop-capital', + 'signature-mark', + 'header', + 'marginalia', + 'footnote', + 'footnote-continued', + 'caption', + 'endnote', + 'footer', + 'keynote', + # not included in the provided models yet: + #'image', + #'table', + #'graphics' +] class InferenceConfig(Config): @@ -51,7 +71,7 @@ def __init__(self, confidence): NAME = "block" IMAGES_PER_GPU = 1 - NUM_CLASSES = 1 + 14 + NUM_CLASSES = len(CLASS_NAMES) # NAME = "block" # IMAGES_PER_GPU = 1 @@ -85,10 +105,6 @@ def process(self): confidence = self.parameter['DETECTION_MIN_CONFIDENCE'] # DETECTION_MIN_CONFIDENCE = Path(self.parameter['DETECTION_MIN_CONFIDENCE']) - class_names = ['BG', 'page-number', 'paragraph', 'catch-word', 'heading', 'drop-capital', 'signature-mark', 'header', - 'marginalia', 'footnote', 'footnote-continued', 'caption', 'endnote', 'footer', 'keynote', - 'image', 'table', 'graphics'] - config = InferenceConfig(confidence) # TODO: allow selecting active class IDs mrcnn_model = model.MaskRCNN(mode="inference", model_dir=str(model_path), config=config) @@ -152,7 +168,7 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, page_image.save('./checkthis.png') if len(img_array.shape) <= 2: img_array = np.stack((img_array,)*3, axis=-1) - results = mrcnn_model.detect([img_array], verbose=1) + results = mrcnn_model.detect([img_array], verbose=0) r = results[0] th = self.parameter['th'] @@ -224,18 +240,20 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, # checking for ymax case with vertical overlapping # along with y, check both for xmax and xmin if (region_bbox[3] <= bbox[3] and region_bbox[3] >= bbox[1] and - ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2]) or (region_bbox[2] >= bbox[0] - and region_bbox[2] <= bbox[2]) or (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2])) - and r['class_ids'][i] != 5): + ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2]) or + (region_bbox[2] >= bbox[0] and region_bbox[2] <= bbox[2]) or + (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2])) and + r['class_ids'][i] != 5): r['rois'][i][2] = bbox[1] - 1 # checking for ymin now # along with y, check both for xmax and xmin if (region_bbox[1] <= bbox[3] and region_bbox[1] >= bbox[1] and - ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2]) or (region_bbox[2] >= bbox[0] - and region_bbox[2] <= bbox[2]) or (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2])) - and r['class_ids'][i] != 5): + ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2]) or + (region_bbox[2] >= bbox[0] and region_bbox[2] <= bbox[2]) or + (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2])) and + r['class_ids'][i] != 5): r['rois'][i][0] = bbox[3] + 1 @@ -313,10 +331,11 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, min_y = r['rois'][i][1] max_x = r['rois'][i][2] max_y = r['rois'][i][3] + class_id = r['class_ids'][i] - if (min_y - 5) > width and r['class_ids'][i] == 2: + if (min_y - 5) > width and class_id == 2: min_y -= 5 - if (max_y + 10) < width and r['class_ids'][i] == 2: + if (max_y + 10) < width and class_id == 2: min_y += 10 # one change here to resolve flipped coordinates @@ -326,7 +345,8 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, if cut_region_polygon.is_empty: continue - cut_region_polygon = [j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]), list(cut_region_polygon.exterior.coords.xy[1]))][:-1] + cut_region_polygon = [j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]), + list(cut_region_polygon.exterior.coords.xy[1]))][:-1] # checking whether coordinates are flipped @@ -348,35 +368,24 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, page_id=page_id, file_grp=self.output_file_grp) - # ai = AlternativeImageType(filename=file_path, comments=page_xywh['features']) - region_id = '%s_region%04d' % (page_id, i) - coords = CoordsType(region_points) - - # incase of imageRegion - if r['class_ids'][i] == 15: - image_region = ImageRegionType(custom='readingOrder {index:'+str(read_order)+';}', id=region_id, Coords=coords, type_=class_names[r['class_ids'][i]]) - # image_region.add_AlternativeImage(ai) + region_args = {'custom': 'readingOrder {index:'+str(read_order)+';}', + 'id': '%s_region%04d' % (page_id, i), + 'Coords': CoordsType(region_points)} + if class_id >= len(CLASS_NAMES): + raise Exception('Unexpected class id %d - model does not match' % class_id) + if CLASS_NAMES[class_id] == 'image': + image_region = ImageRegionType(**region_args) page.add_ImageRegion(image_region) - continue - if r['class_ids'][i] == 16: - table_region = TableRegionType(custom='readingOrder {index:'+str(read_order)+';}', id=region_id, Coords=coords, type_=class_names[r['class_ids'][i]]) - # table_region.add_AlternativeImage(ai) + elif CLASS_NAMES[class_id] == 'table': + table_region = TableRegionType(**region_args) page.add_TableRegion(table_region) - continue - if r['class_ids'][i] == 17: - graphic_region = GraphicRegionType(custom='readingOrder {index:'+str(read_order)+';}', id=region_id, Coords=coords, type_=class_names[r['class_ids'][i]]) - # graphic_region.add_AlternativeImage(ai) + elif CLASS_NAMES[class_id] == 'graphics': + graphic_region = GraphicRegionType(**region_args) page.add_GraphicRegion(graphic_region) - continue - - textregion = TextRegionType(custom='readingOrder {index:'+str(read_order)+';}', id=region_id, Coords=coords, type_=class_names[r['class_ids'][i]]) - # textregion.add_AlternativeImage(ai) - - #border = page.get_Border() - # if border: - # border.add_TextRegion(textregion) - # else: - page.add_TextRegion(textregion) + else: + region_args['type_'] = CLASS_NAMES[class_id] + textregion = TextRegionType(**region_args) + page.add_TextRegion(textregion) @click.command() From a956f6302bb37a41b2b2c098d547062b659d3f06 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Feb 2021 12:06:58 +0100 Subject: [PATCH 14/23] block-segmentation: fix Border intersection --- .../cli/ocrd_anybaseocr_block_segmentation.py | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py index d70e042..31faab8 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py @@ -130,7 +130,7 @@ def process(self): LOG.warning("Image already has text segments!") if oplevel == "page": - self._process_segment(page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, class_names, mask_image) + self._process_segment(page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, mask_image) else: LOG.warning('Operation level %s, but should be "page".', oplevel) break @@ -146,7 +146,7 @@ def process(self): content=to_xml(pcgts).encode('utf-8') ) - def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, class_names, mask): + def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, mask): LOG = getLogger('OcrdAnybaseocrBlockSegmenter') # check for existing text regions and whether to overwrite them border = None @@ -165,7 +165,6 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, # page_image, page_xy = self.workspace.image_from_segment(page.get_Border(), page_image, page_xywh) img_array = ocrolib.pil2array(page_image) - page_image.save('./checkthis.png') if len(img_array.shape) <= 2: img_array = np.stack((img_array,)*3, axis=-1) results = mrcnn_model.detect([img_array], verbose=0) @@ -173,7 +172,7 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, th = self.parameter['th'] # check for existing semgentation mask - # this code executes only when use_deeplr is set to True in ocrd-tool.json file + # this code executes only when the workflow had tiseg run before with use_deeplr=true if mask: mask = ocrolib.pil2array(mask) mask = mask//255 @@ -186,7 +185,6 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, max_x = r['rois'][i][2] max_y = r['rois'][i][3] mask[min_x:max_x, min_y:max_y] *= i+2 - cv2.imwrite('mask_check.png', mask*(255/(len(r['rois'])+2))) # check for left over pixels and add them to the bounding boxes pixel_added = True @@ -307,15 +305,6 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, if (max_y + 10) < width and r['class_ids'][i] == 2: min_y += 10 - region_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]] - - if border: - cut_region_polygon = border.intersection(Polygon(region_polygon)) - if cut_region_polygon.is_empty: - continue - else: - cut_region_polygon = Polygon(region_polygon) - order_index = reading_order.index((min_y, min_x, max_y, max_x)) region_id = '%s_region%04d' % (page_id, i) regionRefIndex = RegionRefIndexedType(index=order_index, regionRef=region_id) @@ -341,9 +330,10 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, # one change here to resolve flipped coordinates region_polygon = [[min_y, min_x], [max_y, min_x], [max_y, max_x], [min_y, max_x]] - cut_region_polygon = border.intersection(Polygon(region_polygon)) - - if cut_region_polygon.is_empty: + cut_region_polygon = Polygon(region_polygon) + if border: + cut_region_polygon = border.intersection(cut_region_polygon) + if cut_region_polygon.is_empty or not cut_region_polygon.is_valid: continue cut_region_polygon = [j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]), list(cut_region_polygon.exterior.coords.xy[1]))][:-1] From 5a4d874a4caf62d960acca4622dd65ab56a58670 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Feb 2021 12:52:57 +0100 Subject: [PATCH 15/23] block-segmentation: fix TF logger init --- ocrd_anybaseocr/tensorflow_importer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ocrd_anybaseocr/tensorflow_importer.py b/ocrd_anybaseocr/tensorflow_importer.py index 5edf4cf..ef1fc17 100644 --- a/ocrd_anybaseocr/tensorflow_importer.py +++ b/ocrd_anybaseocr/tensorflow_importer.py @@ -2,9 +2,8 @@ import os import warnings -from ocrd_utils import initLogging, getLogger -initLogging() -getLogger('tensorflow').setLevel('ERROR') +import logging +logging.getLogger('tensorflow').setLevel('ERROR') os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # No prints from the tensorflow side warnings.filterwarnings('ignore', category=FutureWarning) #import tensorflow as tf From e641e31553ba16ac380bde1a44a79ea2dae47955 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Feb 2021 12:55:33 +0100 Subject: [PATCH 16/23] block-segmentation: remove buggy/useless AlternativeImage creation --- .../cli/ocrd_anybaseocr_block_segmentation.py | 30 ++----------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py index 31faab8..d3a012e 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py @@ -73,12 +73,8 @@ def __init__(self, confidence): IMAGES_PER_GPU = 1 NUM_CLASSES = len(CLASS_NAMES) -# NAME = "block" -# IMAGES_PER_GPU = 1 # NUM_CLASSES = 1 + 14 # DETECTION_MIN_CONFIDENCE = 0.9 # needs to be changed back to parameter - # DETECTION_MIN_CONFIDENCE = DETECTION_MIN_CONFIDENCE #taken as a parameter from tools.json - class OcrdAnybaseocrBlockSegmenter(Processor): @@ -102,15 +98,12 @@ def process(self): model_path = resource_filename(__name__, '../mrcnn') model_weights = Path(self.resolve_resource(self.parameter['block_segmentation_weights'])) - confidence = self.parameter['DETECTION_MIN_CONFIDENCE'] -# DETECTION_MIN_CONFIDENCE = Path(self.parameter['DETECTION_MIN_CONFIDENCE']) - + confidence = self.parameter['min_confidence'] config = InferenceConfig(confidence) # TODO: allow selecting active class IDs mrcnn_model = model.MaskRCNN(mode="inference", model_dir=str(model_path), config=config) mrcnn_model.load_weights(str(model_weights), by_name=True) - oplevel = self.parameter['operation_level'] for (n, input_file) in enumerate(self.input_files): pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -129,11 +122,7 @@ def process(self): if regions: LOG.warning("Image already has text segments!") - if oplevel == "page": - self._process_segment(page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, mask_image) - else: - LOG.warning('Operation level %s, but should be "page".', oplevel) - break + self._process_segment(page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, mask_image) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) @@ -342,22 +331,7 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, region_polygon = coordinates_for_segment(cut_region_polygon, page_image, page_xywh) region_points = points_from_polygon(region_polygon) - read_order = reading_order.index((min_y, min_x, max_y, max_x)) - - # this can be tested, provided whether we need previous comments or not? - # resolving overlapping problem - - region_img = img_array[min_x:max_x, min_y:max_y] # extract from points and img_array - - region_img = ocrolib.array2pil(region_img) - - file_id = make_file_id(input_file, self.output_file_grp) - file_path = self.workspace.save_image_file(region_img, - file_id+"_"+str(i), - page_id=page_id, - file_grp=self.output_file_grp) - region_args = {'custom': 'readingOrder {index:'+str(read_order)+';}', 'id': '%s_region%04d' % (page_id, i), 'Coords': CoordsType(region_points)} From 046735df73c8afe253e2588a31120661c297f8a9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Feb 2021 13:21:26 +0100 Subject: [PATCH 17/23] block-segmentation: fix Border intersection (applies in absolute coords) --- .../cli/ocrd_anybaseocr_block_segmentation.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py index d3a012e..4bf374f 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py @@ -151,7 +151,6 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, border_coords = page.get_Border().get_Coords() border_points = polygon_from_points(border_coords.get_points()) border = Polygon(border_points) -# page_image, page_xy = self.workspace.image_from_segment(page.get_Border(), page_image, page_xywh) img_array = ocrolib.pil2array(page_image) if len(img_array.shape) <= 2: @@ -319,17 +318,20 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, # one change here to resolve flipped coordinates region_polygon = [[min_y, min_x], [max_y, min_x], [max_y, max_x], [min_y, max_x]] + # convert to absolute coordinates + region_polygon = coordinates_for_segment(region_polygon, page_image, page_xywh) + # intersect with parent and plausibilize cut_region_polygon = Polygon(region_polygon) if border: cut_region_polygon = border.intersection(cut_region_polygon) - if cut_region_polygon.is_empty or not cut_region_polygon.is_valid: + if cut_region_polygon.is_empty: + LOG.warning('region %d does not intersect page frame', i) continue - cut_region_polygon = [j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]), + if not cut_region_polygon.is_valid: + LOG.warning('region %d has invalid polygon', i) + continue + region_polygon = [j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]), list(cut_region_polygon.exterior.coords.xy[1]))][:-1] - - # checking whether coordinates are flipped - - region_polygon = coordinates_for_segment(cut_region_polygon, page_image, page_xywh) region_points = points_from_polygon(region_polygon) read_order = reading_order.index((min_y, min_x, max_y, max_x)) region_args = {'custom': 'readingOrder {index:'+str(read_order)+';}', From 0febe79604e66c3ab46edea9038d90a0b5e72413 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Feb 2021 13:23:15 +0100 Subject: [PATCH 18/23] block-segmentation: fix overwrite==false (continue by adding more) --- .../cli/ocrd_anybaseocr_block_segmentation.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py index 4bf374f..0556519 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py @@ -117,10 +117,6 @@ def process(self): mask_image, mask_xywh, mask_image_info = self.workspace.image_from_page(page, page_id, feature_selector='clipped', feature_filter='binarized,deskewed,cropped,non_text') except: mask_image = None - # Display Warning If image segment results already exist or not in StructMap? - regions = page.get_TextRegion() + page.get_TableRegion() - if regions: - LOG.warning("Image already has text segments!") self._process_segment(page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, mask_image) @@ -138,25 +134,26 @@ def process(self): def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, mask): LOG = getLogger('OcrdAnybaseocrBlockSegmenter') # check for existing text regions and whether to overwrite them - border = None - if page.get_TextRegion(): + if page.get_TextRegion() or page.get_TableRegion(): if self.parameter['overwrite']: - LOG.info('removing existing TextRegions in page "%s"', page_id) + LOG.info('removing existing text/table regions in page "%s"', page_id) page.set_TextRegion([]) else: - LOG.warning('keeping existing TextRegions in page "%s"', page_id) - return + LOG.warning('keeping existing text/table regions in page "%s"', page_id) # check if border exists + border = None if page.get_Border(): border_coords = page.get_Border().get_Coords() border_points = polygon_from_points(border_coords.get_points()) border = Polygon(border_points) + LOG.info('detecting regions on page "%s"', page_id) img_array = ocrolib.pil2array(page_image) if len(img_array.shape) <= 2: img_array = np.stack((img_array,)*3, axis=-1) results = mrcnn_model.detect([img_array], verbose=0) r = results[0] + LOG.info('found %d regions on page "%s"', len(r['rois']), page_id) th = self.parameter['th'] # check for existing semgentation mask @@ -352,6 +349,7 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, region_args['type_'] = CLASS_NAMES[class_id] textregion = TextRegionType(**region_args) page.add_TextRegion(textregion) + LOG.info('added %s region on page "%s"', CLASS_NAMES[class_id], page_id) @click.command() From 3d54a19e4041b1116811917b7d89f530e9b6b5b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Feb 2021 13:23:37 +0100 Subject: [PATCH 19/23] block-segmentation: move model loading to setup() --- .../cli/ocrd_anybaseocr_block_segmentation.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py index 0556519..3621618 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py @@ -82,30 +82,33 @@ def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(OcrdAnybaseocrBlockSegmenter, self).__init__(*args, **kwargs) + if hasattr(self, 'output_file_grp') and hasattr(self, 'parameter'): + # processing context + self.setup() + + def setup(self): + LOG = getLogger('OcrdAnybaseocrBlockSegmenter') #self.reading_order = [] self.order = 0 + model_path = resource_filename(__name__, '../mrcnn') + model_weights = Path(self.resolve_resource(self.parameter['block_segmentation_weights'])) + confidence = self.parameter['min_confidence'] + config = InferenceConfig(confidence) + # TODO: allow selecting active class IDs + self.mrcnn_model = model.MaskRCNN(mode="inference", model_dir=str(model_path), config=config) + self.mrcnn_model.load_weights(str(model_weights), by_name=True) + def process(self): assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) LOG = getLogger('OcrdAnybaseocrBlockSegmenter') - if not tf.test.is_gpu_available(): LOG.warning("Tensorflow cannot detect CUDA installation. Running without GPU will be slow.") - model_path = resource_filename(__name__, '../mrcnn') - model_weights = Path(self.resolve_resource(self.parameter['block_segmentation_weights'])) - - confidence = self.parameter['min_confidence'] - config = InferenceConfig(confidence) - # TODO: allow selecting active class IDs - mrcnn_model = model.MaskRCNN(mode="inference", model_dir=str(model_path), config=config) - mrcnn_model.load_weights(str(model_weights), by_name=True) - - for (n, input_file) in enumerate(self.input_files): - + for input_file in self.input_files: pcgts = page_from_file(self.workspace.download_file(input_file)) self.add_metadata(pcgts) page = pcgts.get_Page() @@ -118,7 +121,7 @@ def process(self): except: mask_image = None - self._process_segment(page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, mask_image) + self._process_segment(page_image, page, page_xywh, page_id, input_file, mask_image) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) @@ -131,7 +134,7 @@ def process(self): content=to_xml(pcgts).encode('utf-8') ) - def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, mask): + def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mask): LOG = getLogger('OcrdAnybaseocrBlockSegmenter') # check for existing text regions and whether to overwrite them if page.get_TextRegion() or page.get_TableRegion(): @@ -151,7 +154,7 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, img_array = ocrolib.pil2array(page_image) if len(img_array.shape) <= 2: img_array = np.stack((img_array,)*3, axis=-1) - results = mrcnn_model.detect([img_array], verbose=0) + results = self.mrcnn_model.detect([img_array], verbose=0) r = results[0] LOG.info('found %d regions on page "%s"', len(r['rois']), page_id) From 681b70f3eb3141cc4c4a962c5c439d825021d43a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Feb 2021 13:53:58 +0100 Subject: [PATCH 20/23] block-segmentation: decode masks into polygons --- .../cli/ocrd_anybaseocr_block_segmentation.py | 79 ++++++++++--------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py index 3621618..e1c5d1a 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py @@ -253,17 +253,17 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas # define reading order on basis of coordinates reading_order = [] - for i in range(len(r['rois'])): width, height, _ = img_array.shape - min_x = r['rois'][i][0] - min_y = r['rois'][i][1] - max_x = r['rois'][i][2] - max_y = r['rois'][i][3] + min_x, min_y, max_x, max_y = r['rois'][i] + class_id = r['class_ids'][i] + if class_id >= len(CLASS_NAMES): + raise Exception('Unexpected class id %d - model does not match' % class_id) + class_name = CLASS_NAMES[class_id] - if (min_y - 5) > width and r['class_ids'][i] == 2: + if (min_y - 5) > width and class_name == 'paragraph': min_y -= 5 - if (max_y + 10) < width and r['class_ids'][i] == 2: + if (max_y + 10) < width and class_name == 'paragraph': min_y += 10 reading_order.append((min_y, min_x, max_y, max_x)) @@ -282,41 +282,41 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas # Creating Reading Order object in PageXML order_group = OrderedGroupType(caption="Regions reading order", id=page_id) - - for i in range(len(r['rois'])): - min_x = r['rois'][i][0] - min_y = r['rois'][i][1] - max_x = r['rois'][i][2] - max_y = r['rois'][i][3] - if (min_y - 5) > width and r['class_ids'][i] == 2: - min_y -= 5 - if (max_y + 10) < width and r['class_ids'][i] == 2: - min_y += 10 - - order_index = reading_order.index((min_y, min_x, max_y, max_x)) - region_id = '%s_region%04d' % (page_id, i) - regionRefIndex = RegionRefIndexedType(index=order_index, regionRef=region_id) - order_group.add_RegionRefIndexed(regionRefIndex) - reading_order_object = ReadingOrderType() reading_order_object.set_OrderedGroup(order_group) page.set_ReadingOrder(reading_order_object) for i in range(len(r['rois'])): width, height, _ = img_array.shape - min_x = r['rois'][i][0] - min_y = r['rois'][i][1] - max_x = r['rois'][i][2] - max_y = r['rois'][i][3] + min_x, min_y, max_x, max_y = r['rois'][i] class_id = r['class_ids'][i] + if class_id >= len(CLASS_NAMES): + raise Exception('Unexpected class id %d - model does not match' % class_id) + class_name = CLASS_NAMES[class_id] - if (min_y - 5) > width and class_id == 2: + if (min_y - 5) > width and class_name == 'paragraph': min_y -= 5 - if (max_y + 10) < width and class_id == 2: + if (max_y + 10) < width and class_name == 'paragraph': min_y += 10 - # one change here to resolve flipped coordinates - region_polygon = [[min_y, min_x], [max_y, min_x], [max_y, max_x], [min_y, max_x]] + # estimate glyph scale (roughly) + mask = r['masks'][:,:,i] + area = np.count_nonzero(mask) + scale = int(np.sqrt(area)//10) + scale = scale + (scale+1)%2 # odd + + # dilate mask until we have a single outer contour + contours = [None, None] + for _ in range(10): + if len(contours) == 1: + break + mask = cv2.dilate(mask.astype(np.uint8), + np.ones((scale,scale), np.uint8)) > 0 + contours, _ = cv2.findContours(mask.astype(np.uint8), + cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE) + region_polygon = contours[0][:,0,:] # already in x,y order + #region_polygon = [[min_y, min_x], [max_y, min_x], [max_y, max_x], [min_y, max_x]] # convert to absolute coordinates region_polygon = coordinates_for_segment(region_polygon, page_image, page_xywh) @@ -335,24 +335,25 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas region_points = points_from_polygon(region_polygon) read_order = reading_order.index((min_y, min_x, max_y, max_x)) region_args = {'custom': 'readingOrder {index:'+str(read_order)+';}', - 'id': '%s_region%04d' % (page_id, i), + 'id': 'region%04d' % i, 'Coords': CoordsType(region_points)} - if class_id >= len(CLASS_NAMES): - raise Exception('Unexpected class id %d - model does not match' % class_id) - if CLASS_NAMES[class_id] == 'image': + if class_name == 'image': image_region = ImageRegionType(**region_args) page.add_ImageRegion(image_region) - elif CLASS_NAMES[class_id] == 'table': + elif class_name == 'table': table_region = TableRegionType(**region_args) page.add_TableRegion(table_region) - elif CLASS_NAMES[class_id] == 'graphics': + elif class_name == 'graphics': graphic_region = GraphicRegionType(**region_args) page.add_GraphicRegion(graphic_region) else: - region_args['type_'] = CLASS_NAMES[class_id] + region_args['type_'] = class_name textregion = TextRegionType(**region_args) page.add_TextRegion(textregion) - LOG.info('added %s region on page "%s"', CLASS_NAMES[class_id], page_id) + order_index = reading_order.index((min_y, min_x, max_y, max_x)) + regionRefIndex = RegionRefIndexedType(index=order_index, regionRef=region_args['id']) + order_group.add_RegionRefIndexed(regionRefIndex) + LOG.info('added %s region on page "%s"', class_name, page_id) @click.command() From a8137980c40e6f0200ee6c680bdf5f40764d1560 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 4 Feb 2021 05:05:33 +0100 Subject: [PATCH 21/23] =?UTF-8?q?block=20segmentation:=20post-processing,?= =?UTF-8?q?=20fix=20reading=20order=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - drop unused and dysfunctional code against overlaps - drop wrong reading order algorithm - improve mask post-processing (closing instead of dilation) - make mask-polygon conversion optional - add optional post-processing to reduce overlaps (bbox-only or mask-based): - non-maximum suppression across classes (min_iou_drop) - non-maximum merging across classes (min_iou_merge) - within-other suppression across classes (min_share_drop) - within-other merging across classes (min_share_merge) - implement correct reading order algorithm (bbox-only or mask-based): - partial order constraints under lr-tb assumption - topological sort - annotate confidence along with coordinate results --- .../cli/ocrd_anybaseocr_block_segmentation.py | 346 +++++++++++------- ocrd_anybaseocr/ocrd-tool.json | 73 +++- 2 files changed, 279 insertions(+), 140 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py index e1c5d1a..1c5bbaf 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py @@ -39,10 +39,7 @@ from ..constants import OCRD_TOOL from ..tensorflow_importer import tf - TOOL = 'ocrd-anybaseocr-block-segmentation' -FALLBACK_IMAGE_GRP = 'OCR-D-IMG-BLOCK-SEGMENT' - CLASS_NAMES = ['BG', 'page-number', 'paragraph', @@ -87,7 +84,7 @@ def __init__(self, *args, **kwargs): self.setup() def setup(self): - LOG = getLogger('OcrdAnybaseocrBlockSegmenter') + LOG = getLogger('processor.AnybaseocrBlockSegmenter') #self.reading_order = [] self.order = 0 model_path = resource_filename(__name__, '../mrcnn') @@ -100,11 +97,11 @@ def setup(self): self.mrcnn_model.load_weights(str(model_weights), by_name=True) def process(self): - + """Segment pages into regions using a Mask R-CNN model.""" assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) - LOG = getLogger('OcrdAnybaseocrBlockSegmenter') + LOG = getLogger('processor.AnybaseocrBlockSegmenter') if not tf.test.is_gpu_available(): LOG.warning("Tensorflow cannot detect CUDA installation. Running without GPU will be slow.") @@ -114,14 +111,22 @@ def process(self): page = pcgts.get_Page() page_id = input_file.pageId or input_file.ID + # todo rs: why not cropped? page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized,deskewed,cropped,clipped,non_text') # try to load pixel masks try: - mask_image, mask_xywh, mask_image_info = self.workspace.image_from_page(page, page_id, feature_selector='clipped', feature_filter='binarized,deskewed,cropped,non_text') + # todo rs: this combination only works for tiseg with use_deeplr=true + mask_image, _, _ = self.workspace.image_from_page(page, page_id, feature_selector='clipped', feature_filter='binarized,deskewed,cropped,non_text') except: mask_image = None + if page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + else: + dpi = None - self._process_segment(page_image, page, page_xywh, page_id, input_file, mask_image) + self._process_segment(page_image, page, page_xywh, page_id, input_file, mask_image, dpi) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) @@ -134,8 +139,8 @@ def process(self): content=to_xml(pcgts).encode('utf-8') ) - def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mask): - LOG = getLogger('OcrdAnybaseocrBlockSegmenter') + def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mask, dpi): + LOG = getLogger('processor.AnybaseocrBlockSegmenter') # check for existing text regions and whether to overwrite them if page.get_TextRegion() or page.get_TableRegion(): if self.parameter['overwrite']: @@ -144,11 +149,11 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas else: LOG.warning('keeping existing text/table regions in page "%s"', page_id) # check if border exists - border = None + border_polygon = None if page.get_Border(): border_coords = page.get_Border().get_Coords() border_points = polygon_from_points(border_coords.get_points()) - border = Polygon(border_points) + border_polygon = Polygon(border_points) LOG.info('detecting regions on page "%s"', page_id) img_array = ocrolib.pil2array(page_image) @@ -156,7 +161,7 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas img_array = np.stack((img_array,)*3, axis=-1) results = self.mrcnn_model.detect([img_array], verbose=0) r = results[0] - LOG.info('found %d regions on page "%s"', len(r['rois']), page_id) + LOG.info('found %d candidates on page "%s"', len(r['rois']), page_id) th = self.parameter['th'] # check for existing semgentation mask @@ -168,11 +173,8 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas # multiply all the bounding box part with 2 for i in range(len(r['rois'])): - min_x = r['rois'][i][0] - min_y = r['rois'][i][1] - max_x = r['rois'][i][2] - max_y = r['rois'][i][3] - mask[min_x:max_x, min_y:max_y] *= i+2 + min_y, min_x, max_y, max_x = r['rois'][i] + mask[min_y:max_y, min_x:max_x] *= i+2 # check for left over pixels and add them to the bounding boxes pixel_added = True @@ -181,8 +183,8 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas pixel_added = False left_over = np.where(mask == 1) - for x, y in zip(left_over[0], left_over[1]): - local_mask = mask[x-th:x+th, y-th:y+th] + for y, x in zip(left_over[0], left_over[1]): + local_mask = mask[y-th:y+th, x-th:x+th] candidates = np.where(local_mask > 1) candidates = [k for k in zip(candidates[0], candidates[1])] if len(candidates) > 0: @@ -192,93 +194,189 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas index = local_mask[candidates[0]]-2 # add pixel to mask/bbox - # x,y to bbox with index - if x < r['rois'][index][0]: - r['rois'][index][0] = x + # y,x to bbox with index + if y < r['rois'][index][0]: + r['rois'][index][0] = y - elif x > r['rois'][index][2]: - r['rois'][index][2] = x + elif y > r['rois'][index][2]: + r['rois'][index][2] = y - if y < r['rois'][index][1]: - r['rois'][index][1] = y + if x < r['rois'][index][1]: + r['rois'][index][1] = x - elif y > r['rois'][index][3]: - r['rois'][index][3] = y + elif x > r['rois'][index][3]: + r['rois'][index][3] = x # update the mask - mask[x, y] = index + 2 - - # resolving overlapping problem - bbox_dict = {} # to check any overlapping bbox - class_id_check = [] - - for i in range(len(r['rois'])): - min_x = r['rois'][i][0] - min_y = r['rois'][i][1] - max_x = r['rois'][i][2] - max_y = r['rois'][i][3] - - region_bbox = [min_y, min_x, max_y, max_x] - - for key in bbox_dict: - for bbox in bbox_dict[key]: + mask[y, x] = index + 2 - # checking for ymax case with vertical overlapping - # along with y, check both for xmax and xmin - if (region_bbox[3] <= bbox[3] and region_bbox[3] >= bbox[1] and - ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2]) or - (region_bbox[2] >= bbox[0] and region_bbox[2] <= bbox[2]) or - (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2])) and - r['class_ids'][i] != 5): - - r['rois'][i][2] = bbox[1] - 1 - - # checking for ymin now - # along with y, check both for xmax and xmin - if (region_bbox[1] <= bbox[3] and region_bbox[1] >= bbox[1] and - ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2]) or - (region_bbox[2] >= bbox[0] and region_bbox[2] <= bbox[2]) or - (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2])) and - r['class_ids'][i] != 5): - - r['rois'][i][0] = bbox[3] + 1 - - if r['class_ids'][i] not in class_id_check: - bbox_dict[r['class_ids'][i]] = [] - class_id_check.append(r['class_ids'][i]) - - bbox_dict[r['class_ids'][i]].append(region_bbox) - - # resolving overlapping problem code - - # define reading order on basis of coordinates - reading_order = [] for i in range(len(r['rois'])): - width, height, _ = img_array.shape - min_x, min_y, max_x, max_y = r['rois'][i] class_id = r['class_ids'][i] if class_id >= len(CLASS_NAMES): raise Exception('Unexpected class id %d - model does not match' % class_id) - class_name = CLASS_NAMES[class_id] - if (min_y - 5) > width and class_name == 'paragraph': - min_y -= 5 - if (max_y + 10) < width and class_name == 'paragraph': - min_y += 10 - reading_order.append((min_y, min_x, max_y, max_x)) - - reading_order = sorted(reading_order, key=lambda reading_order: (reading_order[1], reading_order[0])) - for i in range(len(reading_order)): - min_y, min_x, max_y, max_x = reading_order[i] - min_y = 0 - i_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]) - for j in range(i+1, len(reading_order)): - min_y, min_x, max_y, max_x = reading_order[j] - j_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]) - inter = i_poly.intersection(j_poly) - if inter: - reading_order.insert(j+1, reading_order[i]) - del reading_order[i] + # find hull contours on masks + if self.parameter['use_masks']: + r.setdefault('polygons', list()) + # estimate glyph scale (roughly) + scale = int(dpi / 6) + scale = scale + (scale+1)%2 # odd + for i in range(len(r['rois'])): + mask = r['masks'][:,:,i] + mask = cv2.dilate(mask.astype(np.uint8), + np.ones((scale,scale), np.uint8)) > 0 + # close mask until we have a single outer contour + contours = None + for _ in range(10): + mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, + np.ones((scale,scale), np.uint8)) > 0 + contours, _ = cv2.findContours(mask.astype(np.uint8), + cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE) + if len(contours) == 1: + break + r['polygons'].append(Polygon(contours[0][:,0,:])) # already in x,y order + + # to reduce overlaps, apply IoU-based non-maximum suppression + # (and other post-processing against overlaps) across classes, + # but not on the raw pixels, but the smoothed hull polygons + LOG.info('post-processing detections on page "%s"', page_id) + worse = [] + if self.parameter['post_process']: + active = True + def _merge_rois(i, j): + """merges i into j""" + nonlocal r, active + r['rois'][j][0] = min(r['rois'][i][0], r['rois'][j][0]) + r['rois'][j][1] = min(r['rois'][i][1], r['rois'][j][1]) + r['rois'][j][2] = max(r['rois'][i][2], r['rois'][j][2]) + r['rois'][j][3] = max(r['rois'][i][3], r['rois'][j][3]) + r['polygons'][j] = r['polygons'][i].union(r['polygons'][j]) + #r['scores'][j] = max(r['scores'][i], r['scores'][i]) + active = True + # find overlapping pairs + while active: + active = False + for i in range(len(r["class_ids"])): + if i in worse: + continue + for j in range(i + 1, len(r['class_ids'])): + if j in worse: + continue + iclass = r['class_ids'][i] + jclass = r['class_ids'][j] + iname = CLASS_NAMES[iclass] + jname = CLASS_NAMES[jclass] + if (iname == 'drop-capital') != (jname == 'drop-capital'): + # ignore drop-capital overlapping with others + continue + # rs todo: lower priority for footnote? + if (r['rois'][i][1] > r['rois'][j][3] or + r['rois'][i][3] < r['rois'][j][1] or + r['rois'][i][0] > r['rois'][j][2] or + r['rois'][i][2] < r['rois'][j][0]): + # no overlap (cut) + continue + iscore = r['scores'][i] + jscore = r['scores'][j] + if not self.parameter['use_masks']: + LOG.debug("roi %d[%s] overlaps roi %d[%s] and %s (replacing)", + i, iname, j, jname, + "looses" if iscore < jscore else "wins") + if iscore < jscore: + worse.append(i) + break + else: + worse.append(j) + continue + # compare masks + ipoly = r['polygons'][i] + jpoly = r['polygons'][j] + isize = ipoly.area + jsize = jpoly.area + inter = ipoly.intersection(jpoly).area + union = ipoly.union(jpoly).area + # LOG.debug("%d/%d %dpx/%dpx shared %dpx overall %dpx", + # i, j, isize, jsize, inter, union) + if inter / isize > self.parameter['min_share_drop']: + LOG.debug("roi %d[%s] contains roi %d[%s] (replacing)", + j, jname, i, iname) + worse.append(i) + break + elif inter / jsize > self.parameter['min_share_drop']: + LOG.debug("roi %d[%s] contains roi %d[%s] (replacing)", + i, iname, j, jname) + worse.append(j) + elif inter / union > self.parameter['min_iou_drop']: + LOG.debug("roi %d[%s] heavily overlaps roi %d[%s] and %s (replacing)", + i, iname, j, jname, + "looses" if iscore < jscore else "wins") + if iscore < jscore: + worse.append(i) + break + else: + worse.append(j) + elif inter / isize > self.parameter['min_share_merge']: + LOG.debug("roi %d[%s] covers roi %d[%s] (merging)", + j, jname, i, iname) + worse.append(i) + _merge_rois(i, j) + break + elif inter / jsize > self.parameter['min_share_merge']: + LOG.debug("roi %d[%s] covers roi %d[%s] (merging)", + i, iname, j, jname) + worse.append(j) + _merge_rois(j, i) + elif inter / union > self.parameter['min_iou_merge']: + LOG.debug("roi %d[%s] slightly overlaps roi %d[%s] and %s (merging)", + i, iname, j, jname, + "looses" if iscore < jscore else "wins") + if iscore < jscore: + worse.append(i) + _merge_rois(i, j) + break + else: + worse.append(j) + _merge_rois(j, i) + + # define reading order on basis of coordinates + partial_order = np.zeros((len(r['rois']), len(r['rois'])), np.uint8) + for i, (min_y_i, min_x_i, max_y_i, max_x_i) in enumerate(r['rois']): + for j, (min_y_j, min_x_j, max_y_j, max_x_j) in enumerate(r['rois']): + if min_x_i < max_x_j and max_x_i > min_x_j: + # xoverlaps + if min_y_i < min_y_j: + partial_order[i, j] = 1 + else: + min_y = min(min_y_i, min_y_j) + max_y = max(max_y_i, max_y_j) + min_x = min(min_x_i, min_x_j) + max_x = max(max_x_i, max_x_j) + if next((False for (min_y_k, min_x_k, max_y_k, max_x_k) in r['rois'] + if (min_y_k < max_y and max_y_k > min_y and + min_x_k < max_x and max_x_k > min_x)), + True): + # no k in between + if ((min_y_j + max_y_j)/2 < min_y_i and + (min_y_i + max_y_i)/2 > max_y_j): + # vertically unrelated + partial_order[j, i] = 1 + elif max_x_i < min_x_j: + partial_order[i, j] = 1 + def _topsort(po): + visited = np.zeros(po.shape[0], np.bool) + result = list() + def _visit(k): + if visited[k]: + return + visited[k] = True + for l in np.nonzero(po[:, k])[0]: + _visit(l) + result.append(k) + for k in range(po.shape[0]): + _visit(k) + return result + reading_order = _topsort(partial_order) # Creating Reading Order object in PageXML order_group = OrderedGroupType(caption="Regions reading order", id=page_id) @@ -288,55 +386,43 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas for i in range(len(r['rois'])): width, height, _ = img_array.shape - min_x, min_y, max_x, max_y = r['rois'][i] + min_y, min_x, max_y, max_x = r['rois'][i] + score = r['scores'][i] class_id = r['class_ids'][i] - if class_id >= len(CLASS_NAMES): - raise Exception('Unexpected class id %d - model does not match' % class_id) class_name = CLASS_NAMES[class_id] + if i in worse: + LOG.debug("Ignoring instance %d[%s] overlapping better/larger neighbour", + i, class_name) + continue - if (min_y - 5) > width and class_name == 'paragraph': - min_y -= 5 - if (max_y + 10) < width and class_name == 'paragraph': - min_y += 10 - - # estimate glyph scale (roughly) - mask = r['masks'][:,:,i] - area = np.count_nonzero(mask) - scale = int(np.sqrt(area)//10) - scale = scale + (scale+1)%2 # odd - - # dilate mask until we have a single outer contour - contours = [None, None] - for _ in range(10): - if len(contours) == 1: - break - mask = cv2.dilate(mask.astype(np.uint8), - np.ones((scale,scale), np.uint8)) > 0 - contours, _ = cv2.findContours(mask.astype(np.uint8), - cv2.RETR_EXTERNAL, - cv2.CHAIN_APPROX_SIMPLE) - region_polygon = contours[0][:,0,:] # already in x,y order - #region_polygon = [[min_y, min_x], [max_y, min_x], [max_y, max_x], [min_y, max_x]] + if self.parameter['use_masks']: + region_polygon = r['polygons'][i].exterior.coords[:-1] + else: + region_polygon = polygon_from_bbox( + max(min_x - 5, 0) if class_name == 'paragraph' else min_x, + min_y, + min(max_x + 10, width) if class_name == 'paragraph' else max_x, + max_y) # convert to absolute coordinates region_polygon = coordinates_for_segment(region_polygon, page_image, page_xywh) # intersect with parent and plausibilize cut_region_polygon = Polygon(region_polygon) - if border: - cut_region_polygon = border.intersection(cut_region_polygon) + if border_polygon: + cut_region_polygon = border_polygon.intersection(cut_region_polygon) if cut_region_polygon.is_empty: LOG.warning('region %d does not intersect page frame', i) continue if not cut_region_polygon.is_valid: LOG.warning('region %d has invalid polygon', i) continue - region_polygon = [j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]), - list(cut_region_polygon.exterior.coords.xy[1]))][:-1] - region_points = points_from_polygon(region_polygon) - read_order = reading_order.index((min_y, min_x, max_y, max_x)) + region_polygon = cut_region_polygon.exterior.coords[:-1] + region_coords = CoordsType(points_from_polygon(region_polygon), + conf=score) + read_order = reading_order.index(i) region_args = {'custom': 'readingOrder {index:'+str(read_order)+';}', 'id': 'region%04d' % i, - 'Coords': CoordsType(region_points)} + 'Coords': region_coords} if class_name == 'image': image_region = ImageRegionType(**region_args) page.add_ImageRegion(image_region) @@ -350,7 +436,7 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas region_args['type_'] = class_name textregion = TextRegionType(**region_args) page.add_TextRegion(textregion) - order_index = reading_order.index((min_y, min_x, max_y, max_x)) + order_index = reading_order.index(i) regionRefIndex = RegionRefIndexedType(index=order_index, regionRef=region_args['id']) order_group.add_RegionRefIndexed(regionRefIndex) LOG.info('added %s region on page "%s"', class_name, page_id) diff --git a/ocrd_anybaseocr/ocrd-tool.json b/ocrd_anybaseocr/ocrd-tool.json index 73680ef..62025c9 100755 --- a/ocrd_anybaseocr/ocrd-tool.json +++ b/ocrd_anybaseocr/ocrd-tool.json @@ -88,7 +88,7 @@ "input_file_grp": ["OCR-D-IMG-CROP"], "output_file_grp": ["OCR-D-SEG-TISEG"], "categories": ["Layout analysis"], - "steps": ["layout/segmentation/text-image"], + "steps": ["layout/segmentation/text-nontext"], "description": "Separates the text and non-text elements with anyBaseOCR. Outputs clipped versions of the input image as AlternativeImage containing either only text or non-text elements.", "parameters": { "use_deeplr": { @@ -141,7 +141,7 @@ "input_file_grp": ["OCR-D-IMG-CROP"], "output_file_grp": ["OCR-D-SEG-LAYOUT"], "categories": ["Layout analysis"], - "steps": ["layout/segmentation/text-image"], + "steps": ["layout/analysis"], "description": "Generates a table-of-content like document structure of the whole document.", "parameters": { "batch_size": {"type": "number", "format": "integer", "default": 4, "description": "Batch size for generating test images"}, @@ -152,16 +152,69 @@ "ocrd-anybaseocr-block-segmentation": { "executable": "ocrd-anybaseocr-block-segmentation", "input_file_grp": ["OCR-D-IMG"], - "output_file_grp": ["OCR-D-BLOCK-SEGMENT"], + "output_file_grp": ["OCR-D-SEG-BLOCK"], "categories": ["Layout analysis"], - "steps": ["layout/segmentation/text-image"], - "description": "Segments and classifies document segments in a single page and outputs the the region polygons and classes.", + "steps": ["layout/segmentation/region"], + "description": "Segments and classifies regions in each single page and annotates the the region polygons and classes.", "parameters": { - "block_segmentation_weights": { "type": "string","default":"block_segmentation_weights.h5", "required": false, "description": "Path to model weights"}, - "operation_level": {"type": "string", "enum": ["page"], "default": "page","description": "PAGE XML hierarchy level to operate on"}, - "overwrite": {"type": "boolean", "default": false, "description": "check whether to overwrite existing text lines"}, - "th" : {"type": "integer", "default": 15, "description": "num of pixels to include in the area region"}, - "DETECTION_MIN_CONFIDENCE" : {"type": "number", "default": 0.9, "description": "Confidence value for a model to detect bounding box"} + "block_segmentation_weights": { + "type": "string", + "format":"uri", + "content-type": "application/x-hdf;subtype=bag", + "cacheable": true, + "default":"block_segmentation_weights.h5", + "description": "Path to model weights" + }, + "overwrite": { + "type": "boolean", + "default": false, + "description": "whether to delete existing text lines prior to segmentation" + }, + "th": { + "type": "integer", + "default": 15, + "description": "num of pixels to include in the area region (when applying text/non-text mask from tiseg)" + }, + "post_process": { + "type": "boolean", + "default": true, + "description": "whether to apply non-maximum suppression (across classes) on the detections" + }, + "use_masks": { + "type": "boolean", + "default": true, + "description": "whether to segment from the mask as polygon instead of just the bbox" + }, + "min_confidence": { + "type": "number", + "format": "float", + "default": 0.9, + "description": "Confidence threshold for region detections" + }, + "min_share_drop": { + "type": "number", + "format": "float", + "default": 0.9, + "description": "Minimum required overlap (intersection over single) of mask-derived contour area between neighbours to suppress smaller prediction" + }, + "min_share_merge": { + "type": "number", + "format": "float", + "default": 0.8, + "description": "Minimum required overlap (intersection over single) of mask-derived contour area between neighbours to merge smaller prediction" + }, + "min_iou_drop": { + "type": "number", + "format": "float", + "default": 0.8, + "description": "Minimum required overlap (intersection over union) of mask-derived contour area between neighbours to suppress prediction scoring worse" + }, + "min_iou_merge": { + "type": "number", + "format": "float", + "default": 0.2, + "description": "Minimum required overlap (intersection over union) of mask-derived contour area between neighbours to merge prediction scoring worse" + } } } } From 8c3db3721cf183461edac300cd3949be46614695 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 4 Feb 2021 11:39:53 +0100 Subject: [PATCH 22/23] block segmentation: restrict active classes (default suppresses footnote/header etc) --- .../cli/ocrd_anybaseocr_block_segmentation.py | 6 ++- ocrd_anybaseocr/mrcnn/model.py | 40 ++++++++++++++----- ocrd_anybaseocr/ocrd-tool.json | 9 +++++ 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py index 1c5bbaf..6558269 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py @@ -92,7 +92,6 @@ def setup(self): confidence = self.parameter['min_confidence'] config = InferenceConfig(confidence) - # TODO: allow selecting active class IDs self.mrcnn_model = model.MaskRCNN(mode="inference", model_dir=str(model_path), config=config) self.mrcnn_model.load_weights(str(model_weights), by_name=True) @@ -159,7 +158,10 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, mas img_array = ocrolib.pil2array(page_image) if len(img_array.shape) <= 2: img_array = np.stack((img_array,)*3, axis=-1) - results = self.mrcnn_model.detect([img_array], verbose=0) + # convert to incidence matrix + class_ids = np.array([[1 if category in self.parameter['active_classes'] else 0 + for category in CLASS_NAMES]], dtype=np.int32) + results = self.mrcnn_model.detect([img_array], verbose=0, active_class_ids=class_ids) r = results[0] LOG.info('found %d candidates on page "%s"', len(r['rois']), page_id) diff --git a/ocrd_anybaseocr/mrcnn/model.py b/ocrd_anybaseocr/mrcnn/model.py index f1ed968..e0b3aec 100644 --- a/ocrd_anybaseocr/mrcnn/model.py +++ b/ocrd_anybaseocr/mrcnn/model.py @@ -685,7 +685,7 @@ def compute_mask(self, inputs, mask=None): # Detection Layer ############################################################ -def refine_detections_graph(rois, probs, deltas, window, config): +def refine_detections_graph(rois, probs, deltas, window, active_class_ids, config): """Refine classified proposals and filter overlaps and return final detections. @@ -696,10 +696,16 @@ def refine_detections_graph(rois, probs, deltas, window, config): bounding box deltas. window: (y1, x1, y2, x2) in normalized coordinates. The part of the image that contains the image excluding the padding. + active_class_ids: [num_classes]. Has a value of 1 for classes + that are allowed in the dataset of the image, and 0 for classes + that are not allowed in the dataset. Returns detections shaped: [num_detections, (y1, x1, y2, x2, class_id, score)] where coordinates are normalized. """ + # Suppress scores for inactive classes + probs = tf.where(tf.cast(K.tile(K.expand_dims(active_class_ids, 0), (probs.shape[0],1)), tf.bool), + x=probs, y=K.zeros_like(probs)) # Class IDs per ROI class_ids = tf.argmax(probs, axis=1, output_type=tf.int32) # Class probability of the top class of each ROI @@ -809,11 +815,12 @@ def call(self, inputs): m = parse_image_meta_graph(image_meta) image_shape = m['image_shape'][0] window = norm_boxes_graph(m['window'], image_shape[:2]) + active_class_ids = m['active_class_ids'] # Run detection refinement graph on each item in the batch detections_batch = utils.batch_slice( - [rois, mrcnn_class, mrcnn_bbox, window], - lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config), + [rois, mrcnn_class, mrcnn_bbox, window, active_class_ids], + lambda r, p, d, w, c: refine_detections_graph(r, p, d, w, c, self.config), self.config.IMAGES_PER_GPU) # Reshape output @@ -1275,7 +1282,7 @@ def hook(images, augmenter, parents, default): # Active classes # Different datasets have different classes, so track the # classes supported in the dataset of this image. - active_class_ids = np.zeros([dataset.num_classes], dtype=np.int32) + active_class_ids = np.zeros([config.NUM_CLASSES], dtype=np.int32) source_class_ids = dataset.source_class_ids[dataset.image_info[image_id]["source"]] active_class_ids[source_class_ids] = 1 @@ -2379,11 +2386,13 @@ def train(self, train_dataset, val_dataset, learning_rate, epochs, layers, ) self.epoch = max(self.epoch, epochs) - def mold_inputs(self, images): + def mold_inputs(self, images, active_class_ids=None): """Takes a list of images and modifies them to the format expected as an input to the neural network. images: List of image matrices [height,width,depth]. Images can have different sizes. + active_class_ids: List of class_ids allowed for the given images. Or + boolean matrix [images, classes]. Returns 3 Numpy matrices: molded_images: [N, h, w, 3]. Images resized and normalized. @@ -2394,7 +2403,18 @@ def mold_inputs(self, images): molded_images = [] image_metas = [] windows = [] - for image in images: + if isinstance(active_class_ids, np.ndarray): + assert active_class_ids.shape == (len(images), self.config.NUM_CLASSES), \ + "active_class_ids dimensions must match number of images and classes" + active_classes = active_class_ids + elif active_class_ids: + active_classes = np.zeros([self.config.NUM_CLASSES], dtype=np.int32) + active_classes[active_class_ids] = 1 + active_classes = np.tile(active_classes, (len(images), 1)) + else: + active_classes = np.ones([self.config.NUM_CLASSES], dtype=np.int32) + active_classes = np.tile(active_classes, (len(images), 1)) + for i, image in enumerate(images): # Resize image # TODO: move resizing to mold_image() molded_image, window, scale, padding, crop = utils.resize_image( @@ -2407,7 +2427,7 @@ def mold_inputs(self, images): # Build image_meta image_meta = compose_image_meta( 0, image.shape, molded_image.shape, window, scale, - np.zeros([self.config.NUM_CLASSES], dtype=np.int32)) + active_class_ids[i]) # Append molded_images.append(molded_image) windows.append(window) @@ -2483,10 +2503,12 @@ def unmold_detections(self, detections, mrcnn_mask, original_image_shape, return boxes, class_ids, scores, full_masks - def detect(self, images, verbose=0): + def detect(self, images, verbose=0, active_class_ids=None): """Runs the detection pipeline. images: List of images, potentially of different sizes. + active_class_ids: List of class_ids allowed for the given images. Or + Boolean matrix [images, classes]. Returns a list of dicts, one dict per image. The dict contains: rois: [N, (y1, x1, y2, x2)] detection bounding boxes @@ -2504,7 +2526,7 @@ def detect(self, images, verbose=0): log("image", image) # Mold inputs to format expected by the neural network - molded_images, image_metas, windows = self.mold_inputs(images) + molded_images, image_metas, windows = self.mold_inputs(images, active_class_ids) # Validate image sizes # All images in a batch MUST be of the same size diff --git a/ocrd_anybaseocr/ocrd-tool.json b/ocrd_anybaseocr/ocrd-tool.json index 62025c9..1112e25 100755 --- a/ocrd_anybaseocr/ocrd-tool.json +++ b/ocrd_anybaseocr/ocrd-tool.json @@ -175,6 +175,15 @@ "default": 15, "description": "num of pixels to include in the area region (when applying text/non-text mask from tiseg)" }, + "active_classes": { + "type": "array", + "items": { + "type": "string", + "enum": ["page-number", "paragraph", "catch-word", "heading", "drop-capital", "signature-mark", "header", "marginalia", "footnote", "footnote-continued", "caption", "endnote", "footer", "keynote", "image", "table", "graphics"] + }, + "default": ["page-number", "paragraph", "catch-word", "heading", "drop-capital", "signature-mark", "marginalia", "caption"], + "description": "Restrict types of regions to be detected." + }, "post_process": { "type": "boolean", "default": true, From 06e75efeff34a2bf289d40258bc1fad04260aaf9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 19 May 2021 13:35:25 +0200 Subject: [PATCH 23/23] tiseg: import keras from tensorflow not directly, OCR-D/ocrd_all#256 --- ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py index 4b2d044..1e340e2 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_tiseg.py @@ -20,6 +20,7 @@ import numpy as np import shapely import ocrolib +from ..tensorflow_importer import keras from keras.models import load_model #from keras_segmentation.models.unet import resnet50_unet from ocrd import Processor