From 2778e4a14cf9d11fdbaf230173431c35d38ec470 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Feb 2022 03:40:04 +0100 Subject: [PATCH 01/17] Makefile: fix test dependencies; update to resmgr cwd semantics --- Makefile | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index d85d320..98a7732 100755 --- a/Makefile +++ b/Makefile @@ -4,8 +4,8 @@ export CUDA_VISIBLE_DEVICES=0 SHELL = /bin/bash -PYTHON = python -PIP = pip +PYTHON ?= python +PIP ?= pip PIP_INSTALL = $(PIP) install LOG_LEVEL = INFO PYTHONIOENCODING=utf8 @@ -19,6 +19,7 @@ DOCKER_TAG = ocrd/anybaseocr # BEGIN-EVAL makefile-parser --make-help Makefile +.PHONY: help help: @echo "" @echo " Targets" @@ -46,6 +47,7 @@ help: # END-EVAL # Install python deps via pip +.PHONY: deps deps: $(PIP_INSTALL) -r requirements.txt @@ -81,6 +83,7 @@ models: ocrd resmgr download --allow-uninstalled --location cwd ocrd-anybaseocr-layout-analysis '*' ocrd resmgr download --allow-uninstalled --location cwd ocrd-anybaseocr-tiseg '*' +.PHONY: docker docker: docker build -t '$(DOCKER_TAG)' . 
@@ -90,51 +93,60 @@ repo/assets: git clone https://github.com/OCR-D/assets "$@" # Remove assets +.PHONY: assets-clean assets-clean: rm -rf $(testdir)/assets # Setup test assets -assets: repo/assets +.PHONY: assets +assets: repo/assets models mkdir -p $(testdir)/assets cp -r -t $(testdir)/assets repo/assets/data/* - $(MAKE) models - ln -sr ocrd-resources/* $(TESTDATA)/ # # Tests # # Run unit tests +.PHONY: test test: assets-clean assets $(PYTHON) -m pytest --continue-on-collection-errors $(TESTS) # Run CLI tests -cli-test: assets-clean assets \ - test-binarize test-deskew test-crop test-tiseg test-textline test-layout-analysis +.PHONY: cli-test +cli-test: assets-clean assets +cli-test: test-binarize test-deskew test-crop test-tiseg test-textline test-layout-analysis # Test binarization CLI -test-binarize: +.PHONY: test-binarize +test-binarize: assets ocrd-anybaseocr-binarize -m $(TESTDATA)/mets.xml -I MAX -O BIN-TEST # Test deskewing CLI -test-deskew: +.PHONY: test-deskew +test-deskew: test-binarize ocrd-anybaseocr-deskew -m $(TESTDATA)/mets.xml -I BIN-TEST -O DESKEW-TEST # Test cropping CLI -test-crop: +.PHONY: test-crop +test-crop: test-deskew ocrd-anybaseocr-crop -m $(TESTDATA)/mets.xml -I DESKEW-TEST -O CROP-TEST # Test text/non-text segmentation CLI -test-tiseg: +.PHONY: test-tiseg +test-tiseg: test-crop ocrd-anybaseocr-tiseg -m $(TESTDATA)/mets.xml --overwrite -I CROP-TEST -O TISEG-TEST # Test block segmentation CLI -test-block-segmentation: +.PHONY: test-block-segmentation +test-block-segmentation: test-tiseg ocrd-anybaseocr-block-segmentation -m $(TESTDATA)/mets.xml -I TISEG-TEST -O OCR-D-BLOCK-SEGMENT # Test textline extraction CLI -test-textline: +.PHONY: test-textline +test-textline: test-tiseg ocrd-anybaseocr-textline -m $(TESTDATA)/mets.xml -I TISEG-TEST -O TL-TEST # Test document structure analysis CLI -test-layout-analysis: +.PHONY: test-layout-analysis +test-layout-analysis: test-binarize ocrd-anybaseocr-layout-analysis -m $(TESTDATA)/mets.xml -I 
BIN-TEST -O LAYOUT From 9c7e242fcea6f397beea9fb8eeb16e265d9caf58 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Feb 2022 03:41:51 +0100 Subject: [PATCH 02/17] =?UTF-8?q?layout-analysis:=20improve=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - move model loading into `setup` in constructor context - allow directories as models (TF SavedModel format), too - use correct pageId - simplify and polish --- .../cli/ocrd_anybaseocr_layout_analysis.py | 85 ++++++++----------- 1 file changed, 35 insertions(+), 50 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_layout_analysis.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_layout_analysis.py index 87bc9e2..8fb6e89 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_layout_analysis.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_layout_analysis.py @@ -62,36 +62,47 @@ def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(OcrdAnybaseocrLayoutAnalyser, self).__init__(*args, **kwargs) - - def create_model(self, path ):#model_name='inception_v3', def_weights=True, num_classes=34, input_size=(600, 500, 1)): - - ''' - path: string containing path to model definition - ''' - model = load_model(path) - return model + if hasattr(self, 'output_file_grp') and hasattr(self, 'parameter'): + # processing context + self.setup() - def start_test(self, model, img_array, filename, labels): - + def setup(self): + LOG = getLogger('OcrdAnybaseocrLayoutAnalyser') + model_path = Path(self.resolve_resource(self.parameter['model_path'])) + class_mapper_path = Path(self.resolve_resource(self.parameter['class_mapping_path'])) + if not model_path.exists(): + LOG.critical("Layout classfication `model_path` was not found at '%s'", model_path) + sys.exit(1) + LOG.info('Loading model from file %s', str(model_path)) + self.model = self.create_model(str(model_path)) + # load the mapping + pickle_in = 
open(str(class_mapper_path), "rb") + class_indices = pickle.load(pickle_in) + self.label_mapping = dict((v,k) for k,v in class_indices.items()) + + def create_model(self, path): + #model_name='inception_v3', def_weights=True, num_classes=34, input_size=(600, 500, 1)): + '''load Tensorflow model from path''' + return load_model(path) + + def predict(self, img_array): # shape should be 1,600,500 for keras - pred = model.predict(img_array) + pred = self.model.predict(img_array) pred = np.array(pred) # multi-label predictions if len(pred.shape)>2: pred = np.squeeze(pred) pred = pred.T - preds = (pred>=0.5) predictions = [] for index, cls in enumerate(preds): if cls: - predictions.append(labels[index]) + predictions.append(self.label_mapping[index]) if len(predictions) == 0: # if no prediction get the maximum one - predictions.append(labels[np.argmax(pred)]) + predictions.append(self.label_mapping[np.argmax(pred)]) #predictions.append('page') # default label - return predictions def img_resize(self, image_path): @@ -207,49 +218,23 @@ def process(self): LOG = getLogger('OcrdAnybaseocrLayoutAnalyser') if not tf.test.is_gpu_available(): LOG.error("Your system has no CUDA installed. No GPU detected.") - # sys.exit(1) assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) - model_path = Path(self.resolve_resource(self.parameter['model_path'])) - class_mapper_path = Path(self.resolve_resource(self.parameter['class_mapping_path'])) - if not Path(model_path).is_file(): - LOG.error("""\ - Layout Classfication model was not found at '%s'. Make sure the `model_path` parameter - points to the local model path. 
- model can be downloaded from http://url - """ % model_path) - sys.exit(1) - else: - - LOG.info('Loading model from file %s', model_path) - model = self.create_model(str(model_path)) - # load the mapping - pickle_in = open(str(class_mapper_path), "rb") - class_indices = pickle.load(pickle_in) - label_mapping = dict((v,k) for k,v in class_indices.items()) - - # print("INPUT FILE HERE",self.input_files) - for (n, input_file) in enumerate(self.input_files): - pcgts = page_from_file(self.workspace.download_file(input_file)) - fname = pcgts.get_Page().imageFilename + + for input_file in self.input_files: page_id = input_file.pageId or input_file.ID - size = 600, 500 - - self.add_metadata(pcgts) + pcgts = page_from_file(self.workspace.download_file(input_file)) page = pcgts.get_Page() - LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID) - - page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_selector='binarized') - - + LOG.info("INPUT FILE %s", page_id) + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id, feature_selector='binarized') img_array = ocrolib.pil2array(page_image.resize((500, 600), Image.ANTIALIAS)) - img_array = img_array * 1./255. 
+ img_array = img_array / 255 img_array = img_array[np.newaxis, :, :, np.newaxis] - results = self.start_test(model, img_array, fname, label_mapping) + results = self.predict(img_array) LOG.info(results) - self.workspace.mets.set_physical_page_for_file("PHYS_000" + str(n) , input_file) + #self.workspace.mets.set_physical_page_for_file(input_file.pageId, input_file) self.create_logmap_smlink(pcgts) - self.write_to_mets(results, "PHYS_000" + str(n)) + self.write_to_mets(results, input_file.pageId) @click.command() @ocrd_cli_options From a7d3b3cd802f65f83f169079fb5fac413cc3ce7c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Feb 2022 03:47:18 +0100 Subject: [PATCH 03/17] ocrd-tool (tiseg/layout-analysis): use SavedFormat instead of HDF5 by default --- ocrd_anybaseocr/ocrd-tool.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ocrd_anybaseocr/ocrd-tool.json b/ocrd_anybaseocr/ocrd-tool.json index b53acef..1cd747d 100755 --- a/ocrd_anybaseocr/ocrd-tool.json +++ b/ocrd_anybaseocr/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": "https://github.com/OCR-D/ocrd_anybaseocr", - "version": "1.6.0", + "version": "1.6.1", "tools": { "ocrd-anybaseocr-binarize": { "executable": "ocrd-anybaseocr-binarize", @@ -127,10 +127,10 @@ "seg_weights": { "type":"string", "format":"uri", - "content-type": "application/x-hdf;subtype=bag", + "content-type": "text/directory", "cacheable": true, - "default":"seg_model.hdf5", - "description":"Path to weights file for deep learning model when use_deeplr is true." + "default":"seg_model", + "description":"Directory path to deep learning model when use_deeplr is true." 
} } }, @@ -173,8 +173,8 @@ "description": "Generates a table-of-content like document structure of the whole document.", "parameters": { "batch_size": {"type": "number", "format": "integer", "default": 4, "description": "Batch size for generating test images"}, - "model_path": { "type": "string", "default":"structure_analysis.h5", "required": false, "description": "Path to Layout Structure Classification Model"}, - "class_mapping_path": { "type": "string", "default":"mapping_densenet.pickle","required": false, "description": "Path to Layout Structure Classes"} + "model_path": { "type": "string", "format": "uri", "content-type": "text/directory", "cacheable": true, "default":"structure_analysis", "description": "Directory path to layout structure classification model"}, + "class_mapping_path": { "type": "string", "format": "uri", "content-type": "application/python-pickle", "cacheable": true, "default":"mapping_densenet.pickle", "description": "File path to layout structure classes"} } }, "ocrd-anybaseocr-block-segmentation": { @@ -252,7 +252,7 @@ "default": 0.2, "description": "Minimum required overlap (intersection over union) of mask-derived contour area between neighbours to merge prediction scoring worse" } - } + } } } } From 457439715d76a13cd4e2c84d4772395ee3690b63 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Feb 2022 03:48:01 +0100 Subject: [PATCH 04/17] update requirements --- requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index d28820a..2dc3a92 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ -keras >= 2.3.0, < 2.4.0 -keras-preprocessing == 1.1.0 +keras +keras-preprocessing numpy >= 1.15.4 -ocrd >= 2.22.3 +ocrd >= 2.30 ocrd-fork-pylsd >= 0.0.4 -ocrd-fork-ocropy >= 1.4.0a3 # Python3 ocrolib +ocrd-fork-ocropy >= 1.4.0a4 # Python3 ocrolib opencv-python-headless >= 3.4 pandas scikit-image >= 0.17.2 -scipy == 1.4.1 +scipy >= 1.4.1 
setuptools >= 41.0.0 shapely -tensorflow >= 2.1.0, < 2.2.0 +tensorflow torch>=1.1.0 torchvision >= 0.6.1 From cee07d6b06428e5baed208ae34c8f5ff16884430 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 15:22:07 +0100 Subject: [PATCH 05/17] add test for dewarping --- Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 98a7732..8606420 100755 --- a/Makefile +++ b/Makefile @@ -37,8 +37,9 @@ help: @echo " test-crop Test cropping CLI" @echo " test-tiseg Test text/non-text segmentation CLI" @echo " test-block-segmentation Test block segmentation CLI" - @echo " test-textline Test textline extraction CLI" + @echo " test-textline Test textline segmentation CLI" @echo " test-layout-analysis Test document structure analysis CLI" + @echo " test-dewarp Test page dewarping CLI" @echo "" @echo " Variables" @echo "" @@ -114,7 +115,7 @@ test: assets-clean assets # Run CLI tests .PHONY: cli-test cli-test: assets-clean assets -cli-test: test-binarize test-deskew test-crop test-tiseg test-textline test-layout-analysis +cli-test: test-binarize test-deskew test-crop test-tiseg test-textline test-layout-analysis test-dewarp # Test binarization CLI .PHONY: test-binarize @@ -141,11 +142,16 @@ test-tiseg: test-crop test-block-segmentation: test-tiseg ocrd-anybaseocr-block-segmentation -m $(TESTDATA)/mets.xml -I TISEG-TEST -O OCR-D-BLOCK-SEGMENT -# Test textline extraction CLI +# Test textline segmentation CLI .PHONY: test-textline test-textline: test-tiseg ocrd-anybaseocr-textline -m $(TESTDATA)/mets.xml -I TISEG-TEST -O TL-TEST +# Test page dewarping CLI +.PHONY: test-dewarp +test-dewarp: test-crop + ocrd-anybaseocr-dewarp -m $(TESTDATA)/mets.xml -I CROP-TEST -O DEWARP-TEST + # Test document structure analysis CLI .PHONY: test-layout-analysis test-layout-analysis: test-binarize From f8db5f7f11e9296124456d271d43f563c5cc993a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 19:49:12 +0100 
Subject: [PATCH 06/17] tests: fix relative import --- setup.py | 2 +- tests/__init__.py | 0 tests/test_crop.py | 2 +- tests/test_dewarp.py | 4 ++-- tests/test_smoke.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) create mode 100644 tests/__init__.py diff --git a/setup.py b/setup.py index 1a4f90b..06a15c3 100755 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ long_description=open('README.md').read(), long_description_content_type='text/markdown', install_requires=open('requirements.txt').read().split('\n'), - packages=find_packages(exclude=["work_dir", "src"]), + packages=find_packages(exclude=["work_dir", "src", "tests"]), package_data={ '': ['*.json'] }, diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_crop.py b/tests/test_crop.py index 87aa974..c6801ba 100644 --- a/tests/test_crop.py +++ b/tests/test_crop.py @@ -7,7 +7,7 @@ from ocrd_anybaseocr.cli.ocrd_anybaseocr_cropping import OcrdAnybaseocrCropper -from tests.base import TestCase, assets, main, copy_of_directory +from .base import TestCase, assets, main, copy_of_directory class AnyocrCropperTest(TestCase): diff --git a/tests/test_dewarp.py b/tests/test_dewarp.py index 62e4803..e43bed5 100644 --- a/tests/test_dewarp.py +++ b/tests/test_dewarp.py @@ -7,9 +7,9 @@ import torch import pytest -from ocrd_anybaseocr.cli.ocrd_anybaseocr_dewarp import OcrdAnybaseocrDewarper # FIXME srsly y +from ocrd_anybaseocr.cli.ocrd_anybaseocr_dewarp import OcrdAnybaseocrDewarper -from tests.base import TestCase, assets, main, copy_of_directory +from .base import TestCase, assets, main, copy_of_directory class AnyocrDewarperTest(TestCase): diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 7fa378a..4d8e917 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1,6 +1,6 @@ import json -from tests.base import main, CapturingTestCase as TestCase +from .base import main, CapturingTestCase as TestCase from 
ocrd_anybaseocr.cli.ocrd_anybaseocr_binarize import cli as OcrdAnybaseocrBinarizer from ocrd_anybaseocr.cli.ocrd_anybaseocr_block_segmentation import cli as OcrdAnybaseocrBlockSegmenter From b853a63cdcf88befe5bfa26e3218fb22b6a62da9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 19:50:39 +0100 Subject: [PATCH 07/17] tests: fix initLogging --- tests/base.py | 2 +- tests/test_dewarp.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/base.py b/tests/base.py index 2f5a546..95a3043 100644 --- a/tests/base.py +++ b/tests/base.py @@ -27,7 +27,7 @@ class TestCase(VanillaTestCase): def setUpClass(cls): chdir(dirname(realpath(__file__)) + '/..') - def tearDown(self): + def setUp(self): initLogging() class CapturingTestCase(TestCase): diff --git a/tests/test_dewarp.py b/tests/test_dewarp.py index e43bed5..392ebcb 100644 --- a/tests/test_dewarp.py +++ b/tests/test_dewarp.py @@ -3,7 +3,7 @@ from ocrd import Resolver, Workspace from ocrd.processor.base import run_processor -from ocrd_utils import MIMETYPE_PAGE +from ocrd_utils import MIMETYPE_PAGE, initLogging import torch import pytest @@ -14,8 +14,9 @@ class AnyocrDewarperTest(TestCase): def setUp(self): - self.model_path = Path(Path.cwd(), 'models/latest_net_G.pth') + self.model_path = Path(Path.cwd(), 'latest_net_G.pth') self.resolver = Resolver() + initLogging() def test_crop(self): if not torch.cuda.is_available(): From adc2f3e769651e98ed649ca56e83a9158d7d6634 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 19:51:37 +0100 Subject: [PATCH 08/17] test_dewarp: mets.find_files is a generator now --- tests/test_dewarp.py | 4 ++-- tests/test_smoke.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_dewarp.py b/tests/test_dewarp.py index 392ebcb..3b38fed 100644 --- a/tests/test_dewarp.py +++ b/tests/test_dewarp.py @@ -23,7 +23,7 @@ def test_crop(self): pytest.skip('CUDA is not available, cannot test dewarping') with 
copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir: ws = Workspace(self.resolver, wsdir) - pagexml_before = len(ws.mets.find_files(mimetype=MIMETYPE_PAGE)) + pagexml_before = len(ws.mets.find_all_files(mimetype=MIMETYPE_PAGE)) run_processor( OcrdAnybaseocrDewarper, resolver=self.resolver, @@ -33,7 +33,7 @@ def test_crop(self): parameter={'model_path': str(self.model_path)} ) ws.reload_mets() - pagexml_after = len(ws.mets.find_files(mimetype=MIMETYPE_PAGE)) + pagexml_after = len(ws.mets.find_all_files(mimetype=MIMETYPE_PAGE)) self.assertEqual(pagexml_after, pagexml_before + 1) if __name__ == "__main__": diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 4d8e917..be7b495 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -31,7 +31,7 @@ def test_all_help(self): for cli in CLIS: exit_code, out, err = self.invoke_cli(cli, ['--help']) self.assertIn('--input-file-grp', out) - self.assertEquals(exit_code, 0) + self.assertEqual(exit_code, 0) def test_all_json(self): """ From 7ca33b5ebf1ca29a0c6fd447a543dca508cbeb05 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 19:58:51 +0100 Subject: [PATCH 09/17] =?UTF-8?q?dewarping:=20fix=20prepare=5Fdata?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit use custom dataset class for in-memory PIL.Image passing instead of file-based repurposed `AlignedDataset` (since this is faster and reliable: OCR-D does not guarantee us a `.filename` for derived images; also, this does not create temporary files in the input fileGrp anymore) --- ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py | 60 +++++++++++-------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py index 19adef3..9d72856 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py @@ -4,14 +4,15 @@ import sys
import os +from pathlib import Path +from PIL import Image +import click +import torch +import numpy as np -from ..constants import OCRD_TOOL - +import ocrolib from ocrd import Processor from ocrd_models.ocrd_page import to_xml, AlternativeImageType - -import click - from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import ( getLogger, @@ -20,35 +21,42 @@ make_file_id ) from ocrd_modelfactory import page_from_file -from pylab import array -from pathlib import Path from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -import torch -import ocrolib - +from ..constants import OCRD_TOOL from ..pix2pixhd.options.test_options import TestOptions from ..pix2pixhd.models.models import create_model -from ..pix2pixhd.data.data_loader import CreateDataLoader +from ..pix2pixhd.data.base_dataset import BaseDataset, get_params, get_transform +from ..pix2pixhd.util.util import tensor2im TOOL = 'ocrd-anybaseocr-dewarp' -def prepare_data(opt, page_img): +class TestDataset(BaseDataset): + # adopted from pix2pixhd.data.AlignDataset for our TestOptions + # but with in-memory Image + def __init__(self, opt, images): + super().__init__() + self.opt = opt + self.images = images + def __getitem__(self, index): + image = self.images[index] + param = get_params(self.opt, image.size) + trans = get_transform(self.opt, param) + tensor = trans(image.convert('RGB')) + return {'label': tensor, 'path': '', + 'inst': 0, 'image': 0, 'feat': 0} + def __len__(self): + return len(self.images) // self.opt.batchSize * self.opt.batchSize - # XXX this needs to be created or the CreateDataLoader(opt) call will fail - Path(opt.dataroot, 'test_A').mkdir() - data_loader = CreateDataLoader(opt) - print(dir(page_img)) - data_loader.dataset.A_paths = [page_img.filename] - data_loader.dataset.dataset_size = len(data_loader.dataset.A_paths) - data_loader.dataloader = torch.utils.data.DataLoader(data_loader.dataset, - batch_size=opt.batchSize, - shuffle=not 
opt.serial_batches, - num_workers=int(opt.nThreads)) - dataset = data_loader.load_data() - return dataset - -def prepare_options(gpu_id, dataroot, model_path, resize_or_crop, loadSize, fineSize): +def prepare_data(opt, page_img): + # todo: make asynchronous (all pages for continuous quasi-parallel decoding) + dataset = TestDataset(opt, [page_img]) + return torch.utils.data.DataLoader(dataset, + batch_size=opt.batchSize, + shuffle=not opt.serial_batches, + num_workers=int(opt.nThreads)) + +def prepare_options(gpu_id, model_path, resize_or_crop, loadSize, fineSize): LOG = getLogger('OcrdAnybaseocrDewarper') # XXX https://github.com/OCR-D/ocrd_anybaseocr/pull/62#discussion_r450232164 # The problem was with how BaseOptions.parse is implemented in pix2pixHD based on From 5ba789077520ce8e357ce9f03c4c9a40f6dc260b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 20:04:39 +0100 Subject: [PATCH 10/17] =?UTF-8?q?dewarping:=20fix=20image=20post-processin?= =?UTF-8?q?g=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit after decoding, convert tensor to array with due respect for proper channel and dynamic range coding (instead of ad-hoc conversion); then resize while still in RGB and re-binarize (instead of ad-hoc binarization followed by resizing in binary) --- ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py index 9d72856..a51e755 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py @@ -175,16 +175,20 @@ def process(self): def _process_segment(self, model, dataset, page, page_xywh, page_id, input_file, orig_img_size, n): for _, data in enumerate(dataset): w, h = orig_img_size - generated = model.inference( - data['label'], data['inst'], data['image']) - dewarped = 
array(generated.data[0].permute(1, 2, 0).detach().cpu()) - bin_array = array(255*(dewarped > ocrolib.midrange(dewarped)), 'B') - dewarped = ocrolib.array2pil(bin_array) - dewarped = dewarped.resize((w, h)) - - page_xywh['features'] += ',dewarped' - - file_id = make_file_id(input_file, self.output_file_grp) + '-IMG' + generated = self.model.inference(data['label'], data['inst'], data['image']) + #dewarped = generated.data[0].permute(1, 2, 0).detach().cpu().numpy() + ## convert RGB float to uint8 (clipping negative) + #dewarped = Image.fromarray(np.array(np.maximum(0, dewarped) * 255, dtype=np.uint8)) + # zzz: strictly, we should try to invert the dataset's input transform here + dewarped = Image.fromarray(tensor2im(generated.data[0])) + # resize using high-quality interpolation + dewarped = dewarped.resize((w, h), Image.BICUBIC) + # re-binarize + dewarped = np.array(dewarped) + dewarped = np.mean(dewarped, axis=2) > ocrolib.midrange(dewarped) + dewarped = Image.fromarray(dewarped) + coords['features'] += ',dewarped' + file_id = make_file_id(input_file, self.output_file_grp) + '.IMG-DEW' file_path = self.workspace.save_image_file(dewarped, file_id, page_id=input_file.pageId, From 5ca9f9647a8bf438bca6a6bf463ba8534c012e94 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 20:13:29 +0100 Subject: [PATCH 11/17] =?UTF-8?q?dewarping:=20improve=20model=20loading=20?= =?UTF-8?q?and=20rename=20params=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - rebase on pix2pixHD#293 (CPU-only option, Torch>=1.0, less verbose, arg passing) - pass args to pix2pixHD directly (instead of sys.argv hijacking) - no unnecessary verbosity (and only through loggers) - move model loading into startup context via `setup` fn - rename params: * `imgresize` → `resize_mode`, * `resizeHeight` → `resize_height` * `resizeWidth` → `resize_width` - add proper documentation - fix region-level results ---
ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py | 133 +++++++++--------- ocrd_anybaseocr/ocrd-tool.json | 44 +++++- ocrd_anybaseocr/pix2pixhd | 2 +- 3 files changed, 109 insertions(+), 70 deletions(-) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py index a51e755..f693858 100755 --- a/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_dewarp.py @@ -58,36 +58,28 @@ def prepare_data(opt, page_img): def prepare_options(gpu_id, model_path, resize_or_crop, loadSize, fineSize): LOG = getLogger('OcrdAnybaseocrDewarper') - # XXX https://github.com/OCR-D/ocrd_anybaseocr/pull/62#discussion_r450232164 - # The problem was with how BaseOptions.parse is implemented in pix2pixHD based on - # argparse. I cannot explain why but the approach to let pix2pixHD fill the - # TestOptions instance with argparse default values and then modifying the - # instance did not work, the overrides were simply ignored. The only way I got - # pix2pixHD to reliably pick up the overrides was this sys.argv approach. It's - # ugly, true, but so is using argparse as an API. At least this way, it is - # uniform as you say. 
- sys.argv = ['python'] - sys.argv.extend(['--gpu_ids', str(gpu_id)]) - sys.argv.extend(['--nThreads', str(1)]) # test code only supports nThreads = 1 - sys.argv.extend(['--batchSize', str(1)]) # test code only supports batchSize = 1 - sys.argv.extend(['--serial_batches']) # no shuffle - sys.argv.extend(['--no_flip']) # no flip - sys.argv.extend(['--dataroot', dataroot]) - sys.argv.extend(['--checkpoints_dir', str(model_path.parents[1])]) - sys.argv.extend(['--name', model_path.parents[0].name]) - sys.argv.extend(['--label_nc', str(0)]) - sys.argv.extend(['--no_instance']) - sys.argv.extend(['--resize_or_crop', resize_or_crop]) - sys.argv.extend(['--n_blocks_global', str(10)]) - sys.argv.extend(['--n_local_enhancers', str(2)]) - sys.argv.extend(['--loadSize', str(loadSize)]) - sys.argv.extend(['--fineSize', str(fineSize)]) - sys.argv.extend(['--model', 'pix2pixHD']) - sys.argv.extend(['--verbose']) - LOG.debug("Options passed to pix2pixHD: %s", sys.argv) + # we cannot use TestOptions instances directly, because its parse() + # does some nontrivial postprocessing (which we do not want to redo here) + args = [] + args.extend(['--gpu_ids', str(gpu_id)]) + args.extend(['--nThreads', str(1)]) # test code only supports nThreads = 1 + args.extend(['--batchSize', str(1)]) # test code only supports batchSize = 1 + args.extend(['--serial_batches']) # no shuffle + args.extend(['--no_flip']) # no flip + args.extend(['--checkpoints_dir', str(model_path.parents[1])]) + args.extend(['--name', model_path.parents[0].name]) + args.extend(['--label_nc', str(0)]) # number of input label channels (just RGB if zero) + args.extend(['--no_instance']) # no instance maps as input + args.extend(['--resize_or_crop', resize_or_crop]) + args.extend(['--n_blocks_global', str(10)]) + args.extend(['--n_local_enhancers', str(2)]) + args.extend(['--loadSize', str(loadSize)]) + args.extend(['--fineSize', str(fineSize)]) + args.extend(['--model', 'pix2pixHD']) + #args.extend(['--verbose']) + 
LOG.debug("Options passed to pix2pixHD: %s", args) opt = TestOptions() - opt.initialize() - opt = opt.parse(save=False) + opt = opt.parse(args=args, save=False, silent=True) model = create_model(opt) @@ -99,36 +91,56 @@ def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(OcrdAnybaseocrDewarper, self).__init__(*args, **kwargs) + if hasattr(self, 'output_file_grp') and hasattr(self, 'parameter'): + # processing context + self.setup() - - def process(self): + def setup(self): LOG = getLogger('OcrdAnybaseocrDewarper') - - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - if self.parameter['gpu_id'] > -1 and not torch.cuda.is_available(): LOG.warning("torch cannot detect CUDA installation.") self.parameter['gpu_id'] = -1 model_path = Path(self.resolve_resource(self.parameter['model_path'])) if not model_path.is_file(): - LOG.error("""\ - pix2pixHD model file was not found at '%s'. Make sure this file exists. - """ % model_path) + LOG.error("pix2pixHD model file was not found at '%s'", model_path) sys.exit(1) - - opt, model = prepare_options( + self.opt, self.model = prepare_options( gpu_id=self.parameter['gpu_id'], - dataroot=str(Path(self.workspace.directory, self.input_file_grp)), model_path=model_path, - resize_or_crop=self.parameter['imgresize'], - loadSize=self.parameter['resizeHeight'], - fineSize=self.parameter['resizeWidth'], + resize_or_crop=self.parameter['resize_mode'], + loadSize=self.parameter['resize_height'], + fineSize=self.parameter['resize_width'], ) + def process(self): + """Dewarp pages of the workspace via pix2pixHD (conditional GANs) + + Open and deserialise each PAGE input file and its respective image, + then iterate over its segment hierarchy down to the requested + ``operation_level``. 
+ + Next, get the binarized image according to the layout annotation + (from the alternative image of the segment, or by cropping and + deskewing from the parent image as annotated). + + Then pass the image to the preloaded pix2pixHD model for inference. + (It will be resized and/or cropped according to ``resize_width``, + ``resize_height`` and ``resize_mode`` prior to decoding, and the + result will be resized to match the original.) + + After decoding, add the new image file to the output fileGrp for + the same pageId (using a file ID with suffix ``.IMG-DEW``). + Reference the new image file in the AlternativeImage of the segment. + + Produce a new output file by serialising the resulting hierarchy. + """ + LOG = getLogger('OcrdAnybaseocrDewarper') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + oplevel = self.parameter['operation_level'] - for (n, input_file) in enumerate(self.input_files): + for input_file in self.input_files: page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %s", page_id) @@ -136,30 +148,25 @@ def process(self): self.add_metadata(pcgts) page = pcgts.get_Page() - try: - page_image, page_xywh, _ = self.workspace.image_from_page( - page, page_id, feature_filter='dewarped', feature_selector='binarized') # images should be deskewed and cropped - except Exception: - page_image, page_xywh, _ = self.workspace.image_from_page( - page, page_id, feature_filter='dewarped') # images should be deskewed and cropped + page_image, page_xywh, _ = self.workspace.image_from_page( + page, page_id, + # images SHOULD be deskewed and cropped, and MUST be binarized + feature_filter='dewarped', feature_selector='binarized') if oplevel == 'page': - dataset = prepare_data(opt, page_image) - orig_img_size = page_image.size self._process_segment( - model, dataset, page, page_xywh, page_id, input_file, orig_img_size, n) + prepare_data(self.opt, page_image), page, page_xywh, page_image.size, 
input_file) else: regions = page.get_TextRegion() + page.get_TableRegion() # get all regions? if not regions: LOG.warning("Page '%s' contains no text regions", page_id) for _, region in enumerate(regions): region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) + region, page_image, page_xywh, + # images SHOULD be deskewed and cropped, and MUST be binarized + feature_filter='dewarped', feature_selector='binarized') # TODO: not tested on regions - # TODO: region has to exist as a physical file to be processed by pix2pixHD - dataset = prepare_data(opt, region_image) - orig_img_size = region_image.size self._process_segment( - model, dataset, page, region_xywh, region.id, input_file, orig_img_size, n) + prepare_data(self.opt, region_image), region, region_xywh, region_image.size, input_file) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) @@ -172,7 +179,7 @@ def process(self): content=to_xml(pcgts).encode('utf-8') ) - def _process_segment(self, model, dataset, page, page_xywh, page_id, input_file, orig_img_size, n): + def _process_segment(self, dataset, segment, coords, orig_img_size, input_file): for _, data in enumerate(dataset): w, h = orig_img_size generated = self.model.inference(data['label'], data['inst'], data['image']) @@ -193,9 +200,9 @@ def _process_segment(self, model, dataset, page, page_xywh, page_id, input_file, file_id, page_id=input_file.pageId, file_grp=self.output_file_grp, - ) - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=page_xywh['features'])) + ) + segment.add_AlternativeImage(AlternativeImageType( + filename=file_path, comments=coords['features'])) @click.command() @ocrd_cli_options diff --git a/ocrd_anybaseocr/ocrd-tool.json b/ocrd_anybaseocr/ocrd-tool.json index 1cd747d..2f468a7 100755 --- a/ocrd_anybaseocr/ocrd-tool.json +++ b/ocrd_anybaseocr/ocrd-tool.json @@ -103,12 +103,44 @@ "input_file_grp": ["OCR-D-IMG-CROP"], 
"output_file_grp": ["OCR-D-IMG-DEWARP"], "parameters": { - "imgresize": { "type": "string", "default": "resize_and_crop", "description": "run on original size image"}, - "model_path": { "type": "string", "default": "latest_net_G.pth", "description": "Path to the trained pix2pixHD model", "cacheable": true, "content-type": "application/vnd.pytorch"}, - "gpu_id": { "type": "number", "format": "integer", "default": -1, "description": "device ID of CUDA GPU to use. Set -1 to use CPU only."}, - "resizeHeight": { "type": "number", "format": "integer", "default": 1024, "description": "resized image height"}, - "resizeWidth": { "type": "number", "format": "integer", "default": 1024, "description": "resized image width"}, - "operation_level": {"type": "string", "enum": ["page","region"], "default": "page","description": "PAGE XML hierarchy level to operate on (should match what model was trained on!)"} + "resize_mode": { + "type": "string", + "enum": ["resize_and_crop", "crop", "scale_width", "scale_width_and_crop", "none"], + "default": "resize_and_crop", + "description": "transformation to apply to the original image before input to the network" + }, + "resize_height": { + "type": "number", + "format": "integer", + "default": 1024, + "description": "target image height before input to the network" + }, + "resize_width": { + "type": "number", + "format": "integer", + "default": 1024, + "description": "target image width before input to the network" + }, + "model_path": { + "type": "string", + "format": "uri", + "default": "latest_net_G.pth", + "description": "Path to the trained pix2pixHD model", + "cacheable": true, + "content-type": "application/vnd.pytorch" + }, + "gpu_id": { + "type": "number", + "format": "integer", + "default": -1, + "description": "CUDA device ID of GPU to use, or -1 for CPU only" + }, + "operation_level": { + "type": "string", + "enum": ["page", "region"], + "default": "page", + "description": "PAGE XML hierarchy level to operate on (should match 
what model was trained on!)" + } } }, "ocrd-anybaseocr-tiseg": { diff --git a/ocrd_anybaseocr/pix2pixhd b/ocrd_anybaseocr/pix2pixhd index 5a2c872..e524de2 160000 --- a/ocrd_anybaseocr/pix2pixhd +++ b/ocrd_anybaseocr/pix2pixhd @@ -1 +1 @@ -Subproject commit 5a2c87201c5957e2bf51d79b8acddb9cc1920b26 +Subproject commit e524de235b251adddee6ca2bcbd31115a834077c From 3575f9c0bb7cb649c59081fd0b891d5ec0e381ab Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 20:18:58 +0100 Subject: [PATCH 12/17] test_dewarp: also when on CPU, use CROP as input (just BIN is not enough / not as good / not realistic) --- tests/test_dewarp.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_dewarp.py b/tests/test_dewarp.py index 3b38fed..49bb808 100644 --- a/tests/test_dewarp.py +++ b/tests/test_dewarp.py @@ -18,9 +18,7 @@ def setUp(self): self.resolver = Resolver() initLogging() - def test_crop(self): - if not torch.cuda.is_available(): - pytest.skip('CUDA is not available, cannot test dewarping') + def test_dewarp(self): with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir: ws = Workspace(self.resolver, wsdir) pagexml_before = len(ws.mets.find_all_files(mimetype=MIMETYPE_PAGE)) @@ -28,7 +26,7 @@ def test_crop(self): OcrdAnybaseocrDewarper, resolver=self.resolver, mets_url=str(Path(wsdir, 'mets.xml')), - input_file_grp='BIN', + input_file_grp='CROP', output_file_grp='DEWARP-TEST', parameter={'model_path': str(self.model_path)} ) From e1e84bb21c33ea7836306f3b306b47fbce26b91d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 20:20:13 +0100 Subject: [PATCH 13/17] :package: 1.7.0 --- ocrd_anybaseocr/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_anybaseocr/ocrd-tool.json b/ocrd_anybaseocr/ocrd-tool.json index 2f468a7..210679e 100755 --- a/ocrd_anybaseocr/ocrd-tool.json +++ b/ocrd_anybaseocr/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": 
"https://github.com/OCR-D/ocrd_anybaseocr", - "version": "1.6.1", + "version": "1.7.0", "tools": { "ocrd-anybaseocr-binarize": { "executable": "ocrd-anybaseocr-binarize", From 3cfc69b3d4823cf340d256c97983f5d0c31f52a4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 22:48:45 +0100 Subject: [PATCH 14/17] fix/update README --- README.md | 212 ++++++++++++++++++++++++------------------------------ 1 file changed, 95 insertions(+), 117 deletions(-) diff --git a/README.md b/README.md index bcac381..ff294e6 100755 --- a/README.md +++ b/README.md @@ -2,182 +2,160 @@ [![CircleCI](https://circleci.com/gh/OCR-D/ocrd_anybaseocr.svg?style=svg)](https://circleci.com/gh/OCR-D/ocrd_anybaseocr) -> Tools for preprocessing scanned images for OCR +> Tools to preprocess and segment scanned images for OCR-D + + * [Installing](#installing) + * [Tools](#tools) + * [Binarizer](#binarizer) + * [Deskewer](#deskewer) + * [Cropper](#cropper) + * [Dewarper](#dewarper) + * [Text/Non-Text Segmenter](#textnon-text-segmenter) + * [Block Segmenter](#block-segmenter) + * [Textline Segmenter](#textline-segmenter) + * [Document Analyser](#document-analyser) + * [Testing](#testing) + * [License](#license) # Installing -- 1. Create a new `venv` unless you already have one +1. Create a new `venv` unless you already have one - $ python3 -m venv venv + python3 -m venv venv -* 2. Activate the `venv` +2. Activate the `venv` - $ source venv/bin/activate + source venv/bin/activate -* 3. Install with `make` +3. Install with `make` - $ make install + make install -#Tools +# Tools + +All tools, also called _processors_, abide by the [CLI specifications]((https://ocr-d.de/en/spec/cli)) for [OCR-D](https://ocr-d.de), which roughly looks like: + + ocrd- [-m ] -I -O [-p ]* [-P ]* ## Binarizer ### Method Behaviour - This function takes a scanned colored /gray scale document image as input and do the black and white binarize image. 
+For each page (or sub-segment), this processor takes a scanned colored / gray scale document image as input and computes a binarized (black and white) image. + +Implemented via rule-based methods (percentile based adaptive background estimation in Ocrolib). - #### Usage: -```sh -ocrd-anybaseocr-binarize -m (path to METs input file) -I (Input group name) -O (Output group name) [-p (path to parameter file) -o (METs output filename)] -``` +### Example + + ocrd-anybaseocr-binarize -I OCR-D-IMG -O OCR-D-BIN -P operation_level line -P threshold 0.3 -#### Example: -```sh -ocrd-anybaseocr-binarize \ - -m mets.xml \ - -I OCR-D-IMG \ - -O OCR-D-PAGE-BIN -``` ## Deskewer ### Method Behaviour - This function takes a document image as input and do the skew correction of that document. The input images have to be binarized for this module to work. +For each page (or sub-segment), this processor takes a document image as input and computes the skew angle of that. It also annotates a deskewed image. + +The input images have to be binarized for this module to work. + +Implemented via rule-based methods (binary projection profile entropy maximization in Ocrolib). - #### Usage: -```sh -ocrd-anybaseocr-deskew -m (path to METs input file) -I (Input group name) -O (Output group name) [-p (path to parameter file) -o (METs output filename)] -``` +### Example -#### Example: -```sh -ocrd-anybaseocr-deskew \ - -m mets.xml \ - -I OCR-D-PAGE-BIN \ - -O OCR-D-PAGE-DESKEW -``` + ocrd-anybaseocr-deskew -I OCR-D-BIN -O OCR-D-DESKEW -P maxskew 5.0 -P skewsteps 20 -P operation_level page ## Cropper ### Method Behaviour - This function takes a document image as input and crops/selects the page content area only (that's mean remove textual noise as well as any other noise around page content area). The input image need not be binarized but should be deskewed for the module to work optimally. 
- - #### Usage: -```sh -ocrd-anybaseocr-crop -m (path to METs input file) -I (Input group name) -O (Output group name) [-p (path to parameter file) -o (METs output filename)] -``` +For each page, this processor takes a document image as input and computes the border around the page content area (i.e. removes textual noise as well as any other noise around the page frame). It also annotates a cropped image. -#### Example: -```sh -ocrd-anybaseocr-crop \ - -m mets.xml \ - -I OCR-D-PAGE-DESKEW \ - -O OCR-D-PAGE-CROP -``` +The input image need not be binarized, but should be deskewed for the module to work optimally. + +Implemented via rule-based methods (gradient-based line segment detection and morphology based textline detection). + +### Example: + ocrd-anybaseocr-crop -I OCR-D-DESKEW -O OCR-D-CROP -P rulerAreaMax 0 -P marginLeft 0.1 ## Dewarper ### Method Behaviour - This function takes a document image as input and make the text line straight if its curved. The input image has to be binarized for the module to work. - - #### Usage: -```sh -ocrd-anybaseocr-dewarp -m (path to METs input file) -I (Input group name) -O (Output group name) [-p (path to parameter file) -o (METs output filename)] -``` +For each page, this processor takes a document image as input and computes a morphed image which will make the text lines straight if they are curved. +The input image has to be binarized for the module to work, and should be cropped and deskewed for optimal quality. -#### Example: -```sh -CUDA_VISIBLE_DEVICES=0 ocrd-anybaseocr-dewarp \ - -m mets.xml \ - -I OCR-D-PAGE-CROP \ - -O OCR-D-PAGE-DEWARP -``` +Implemented via data-driven methods (neural GAN conditional image model trained with pix2pixHD/Pytorch). 
+ +### Example + + ocrd-anybaseocr-dewarp -I OCR-D-CROP -O OCR-D-DEWARP -P resize_mode none -P gpu_id -1 ## Text/Non-Text Segmenter ### Method Behaviour - This function takes a document image as an input and separates the text and non-text part from the input document image. - The module outputs 2 AlternativeImages instead of document regions, which are clipped (binarized) versions of the input image, containing either only text or only non-text components. +For each page, this processor takes a document image as an input and computes two images, separating the text and non-text parts. + +The input image has to be binarized for the module to work, and should be cropped and deskewed for optimal quality. + +Implemented via data-driven methods (neural pixel classifier model trained with Tensorflow/Keras). - #### Usage: -```sh -ocrd-anybaseocr-tiseg -m (path to METs input file) -I (Input group name) -O (Output group name) [-p (path to parameter file) -o (METs output filename)] -``` +### Example -#### Example: -```sh -ocrd-anybaseocr-tiseg \ - -m mets.xml \ - -I OCR-D-PAGE-CROP \ - -O OCR-D-PAGE-TISEG -``` + ocrd-anybaseocr-tiseg -I OCR-D-DEWARP -O OCR-D-TISEG -P use_deeplr true -## Textline Segmenter +## Block Segmenter ### Method Behaviour - This function takes a cropped document image as an input and segment the image into textline images. The input image should be binarized and deskewed for the module to work. +For each page, this processor takes the raw document image as an input and computes a text region segmentation for it (distinguishing various types of text blocks). + +The input image need not be binarized, but should be deskewed for the module to work optimally. + +Implemented via data-driven methods (neural Mask-RCNN instance segmentation model trained with Tensorflow/Keras). 
- #### Usage: -```sh -ocrd-anybaseocr-textline -m (path to METs input file) -I (Input group name) -O (Output group name) [-p (path to parameter file) -o (METs output filename)] -``` +### Example -#### Example: -```sh -ocrd-anybaseocr-textline \ - -m mets.xml \ - -I OCR-D-PAGE-TISEG \ - -O OCR-D-PAGE-TL -``` + ocrd-anybaseocr-block-segmenter -I OCR-D-TISEG -O OCR-D-BLOCK -P active_classes '["page-number", "paragraph", "heading", "drop-capital", "marginalia", "caption"]' -P min_confidence 0.8 -P post_process true - + ocrd-anybaseocr-textline -I OCR-D-BLOCK -O OCR-D-LINE -P operation_level region ## Document Analyser ### Method Behaviour - This function takes all the cropped document images of a single book and its corresponding text regions as input and generates the logical structure on the book level. The input image should be binarized for this module to work. - - #### Usage: -```sh -ocrd-anybaseocr-layout-analysis -m (path to METs input file) -I (Input group name) -O (Output group name) [-p (path to parameter file) -o (METs output filename)] -``` +For the whole document, this processor takes all the cropped page images and their corresponding text regions as input and computes the logical structure (page types and sections). -#### Example: -```sh -ocrd-anybaseocr-layout-analysis \ - -m mets.xml \ - -I OCR-IMG \ - -O OCR-D-PAGE-BLOCK -``` +The input image should be binarized and segmented for this module to work. + +### Example + ocrd-anybaseocr-layout-analysis -I OCR-D-LINE -O OCR-D-STRUCT ## Testing -To test the tools, download [OCR-D/assets](https://github.com/OCR-D/assets). In -particular, the code is tested with the -[dfki-testdata](https://github.com/OCR-D/assets/tree/master/data/dfki-testdata) +To test the tools under realistic conditions (on OCR-D workspaces), +download [OCR-D/assets](https://github.com/OCR-D/assets). In particular, +the code is tested with the [dfki-testdata](https://github.com/OCR-D/assets/tree/master/data/dfki-testdata) dataset. 
-Run `make test` to run all tests. +To download the data: + + make assets + +To run module tests: + + make test + +To run processor/workflow tests: + + make cli-test ## License @@ -194,4 +172,4 @@ Run `make test` to run all tests. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - ``` +``` From 89f337e63fc59a75b0186ab82dacd196a8e0ee6f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 22:48:51 +0100 Subject: [PATCH 15/17] update CHANGELOG --- CHANGELOG.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bf2336..926e386 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,20 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + +* Makefile/tests: fix tests, update to resmgr cwd semantics, add dewarp +* layout-analysis: use correct pageId +* tiseg/layout-analysis: use TF SavedFormat instead of HDF5 +* dewarp/layout-analysis: load during init (`setup` instead of `process`) +* dewarp: fix image input (in-memory instead of file-based) +* dewarp: fix image output (resizing with better quality) +* dewarp: fix/update pix2pixHD for CPU-only and newer PyTorch +* dewarp: rename parameters (now `resize_{mode,width,height}`) +* dewarp: fix oplevel region, update to ocrd_mets changes +* update requirements +* improve README + ## [1.6.0] - 2021-05-20 Removed: From 0bbcb66b3292a6ef99ff1b7ee1ad4e0082865a44 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Feb 2022 23:02:49 +0100 Subject: [PATCH 16/17] layout-analysis: fix parent fornew chapter/section --- ocrd_anybaseocr/cli/ocrd_anybaseocr_layout_analysis.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_layout_analysis.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_layout_analysis.py index 8fb6e89..4ba6c6e 100755 --- 
a/ocrd_anybaseocr/cli/ocrd_anybaseocr_layout_analysis.py +++ b/ocrd_anybaseocr/cli/ocrd_anybaseocr_layout_analysis.py @@ -181,6 +181,11 @@ def write_to_mets(self, result, pageID): if self.first is None: self.first = 'chapter' parent_node = self.log_map + # rs: not sure about the remaining branches (cf. #73) + elif self.first == i: + parent_node = self.log_map + else: + parent_node = self.log_links[self.first] log_div = ET.SubElement(parent_node, TAG_METS_DIV) log_div.set('TYPE', str(i)) From 01aea45d409cf271fef3dcc04dbe810258774dd2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 21 Feb 2022 00:21:54 +0100 Subject: [PATCH 17/17] README: explain resmgr download and pip install --- README.md | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ff294e6..e824055 100755 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # Document Preprocessing and Segmentation [![CircleCI](https://circleci.com/gh/OCR-D/ocrd_anybaseocr.svg?style=svg)](https://circleci.com/gh/OCR-D/ocrd_anybaseocr) +[![PyPI](https://img.shields.io/pypi/v/ocrd_anybaseocr.svg)](https://pypi.org/project/ocrd_anybaseocr/) + > Tools to preprocess and segment scanned images for OCR-D @@ -19,6 +21,8 @@ # Installing +Requires Python >= 3.6. + 1. Create a new `venv` unless you already have one python3 -m venv venv @@ -27,13 +31,19 @@ source venv/bin/activate -3. Install with `make` +3. To install from source, get GNU make and do: make install + There are also prebuilds available on PyPI: + + pip install ocrd_anybaseocr + +(This will install both PyTorch and TensorFlow, along with their dependencies.)
+ # Tools -All tools, also called _processors_, abide by the [CLI specifications]((https://ocr-d.de/en/spec/cli)) for [OCR-D](https://ocr-d.de), which roughly looks like: +All tools, also called _processors_, abide by the [CLI specifications](https://ocr-d.de/en/spec/cli) for [OCR-D](https://ocr-d.de), which roughly looks like: ocrd- [-m ] -I -O [-p ]* [-P ]* @@ -84,6 +94,10 @@ The input image has to be binarized for the module to work, and should be croppe Implemented via data-driven methods (neural GAN conditional image model trained with pix2pixHD/Pytorch). +### Models + + ocrd resmgr download ocrd-anybaseocr-dewarp '*' + ### Example ocrd-anybaseocr-dewarp -I OCR-D-CROP -O OCR-D-DEWARP -P resize_mode none -P gpu_id -1 @@ -97,6 +111,10 @@ The input image has to be binarized for the module to work, and should be croppe Implemented via data-driven methods (neural pixel classifier model trained with Tensorflow/Keras). +### Models + + ocrd resmgr download ocrd-anybaseocr-tiseg '*' + ### Example ocrd-anybaseocr-tiseg -I OCR-D-DEWARP -O OCR-D-TISEG -P use_deeplr true @@ -110,9 +128,13 @@ The input image need not be binarized, but should be deskewed for the module to Implemented via data-driven methods (neural Mask-RCNN instance segmentation model trained with Tensorflow/Keras). 
+### Models + + ocrd resmgr download ocrd-anybaseocr-block-segmentation '*' + ### Example - ocrd-anybaseocr-block-segmenter -I OCR-D-TISEG -O OCR-D-BLOCK -P active_classes '["page-number", "paragraph", "heading", "drop-capital", "marginalia", "caption"]' -P min_confidence 0.8 -P post_process true + ocrd-anybaseocr-block-segmentation -I OCR-D-TISEG -O OCR-D-BLOCK -P active_classes '["page-number", "paragraph", "heading", "drop-capital", "marginalia", "caption"]' -P min_confidence 0.8 -P post_process true ## Textline Segmenter @@ -133,7 +155,13 @@ Implemented via rule-based methods (gradient and morphology based line estimatio For the whole document, this processor takes all the cropped page images and their corresponding text regions as input and computes the logical structure (page types and sections). The input image should be binarized and segmented for this module to work. - + +Implemented via data-driven methods (neural Inception-V3 image classification model trained with Tensorflow/Keras). + +### Models + + ocrd resmgr download ocrd-anybaseocr-layout-analysis '*' + ### Example ocrd-anybaseocr-layout-analysis -I OCR-D-LINE -O OCR-D-STRUCT