From dfe09ebda9f0155f3c80dcee79fe4eff91a812f6 Mon Sep 17 00:00:00 2001 From: felix Date: Mon, 2 Oct 2023 16:50:57 +0200 Subject: [PATCH 01/13] [skip ci] compute orientation on seq map --- doctr/models/_utils.py | 12 ++++------ doctr/models/detection/predictor/pytorch.py | 17 +++++++++---- .../models/detection/predictor/tensorflow.py | 15 ++++++++---- doctr/models/kie_predictor/pytorch.py | 24 ++++++++++++++----- doctr/models/kie_predictor/tensorflow.py | 22 +++++++++++------ doctr/models/predictor/pytorch.py | 21 +++++++++++----- doctr/models/predictor/tensorflow.py | 21 ++++++++++------ doctr/models/zoo.py | 14 +++++++++++ tests/common/test_models.py | 17 ++++--------- 9 files changed, 109 insertions(+), 54 deletions(-) diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py index 1021afdc0c..304fa9a7e8 100644 --- a/doctr/models/_utils.py +++ b/doctr/models/_utils.py @@ -27,13 +27,12 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float: return max(w / h, h / w) -def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> float: +def estimate_orientation(seq_map: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> float: """Estimate the angle of the general document orientation based on the lines of the document and the assumption that they should be horizontal. Args: - ---- - img: the img to analyze + seq_map: the binarized image of the document n_ct: the number of contours used for the orientation estimation ratio_threshold_for_lines: this is the ratio w/h used to discriminates lines @@ -41,16 +40,13 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li ------- the angle of the general document orientation """ - gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - gray_img = cv2.medianBlur(gray_img, 5) - thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] # try to merge words in lines - (h, w) = img.shape[:2] + (h, w) = seq_map.shape[:2] k_x = max(1, (floor(w / 100))) k_y = max(1, (floor(h / 100))) kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y)) - thresh = cv2.dilate(thresh, kernel, iterations=1) + thresh = cv2.dilate(seq_map, kernel, iterations=1) # extract contours contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) diff --git a/doctr/models/detection/predictor/pytorch.py b/doctr/models/detection/predictor/pytorch.py index 8202abca8d..34f26f03e4 100644 --- a/doctr/models/detection/predictor/pytorch.py +++ b/doctr/models/detection/predictor/pytorch.py @@ -3,7 +3,7 @@ # This program is licensed under the Apache License 2.0. # See LICENSE or go to for full license details. -from typing import Any, List, Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import torch @@ -37,8 +37,9 @@ def __init__( def forward( self, pages: List[Union[np.ndarray, torch.Tensor]], + return_maps: bool = False, **kwargs: Any, - ) -> List[np.ndarray]: + ) -> Union[List[Dict[str, np.ndarray]], Tuple[List[Dict[str, np.ndarray]], List[np.ndarray]]]: # Dimension check if any(page.ndim != 3 for page in pages): raise ValueError("incorrect input shape: all pages are expected to be multi-channel 2D images.") @@ -48,5 +49,13 @@ def forward( self.model, processed_batches = set_device_and_dtype( self.model, processed_batches, _params.device, _params.dtype ) - predicted_batches = [self.model(batch, return_preds=True, **kwargs)["preds"] for batch in processed_batches] - return [pred for batch in predicted_batches for pred in batch] + predicted_batches = [ + self.model(batch, return_preds=True, return_model_output=True, **kwargs) for batch in processed_batches + ] + preds = [pred for batch in predicted_batches for pred in batch["preds"]] + seq_maps = [ + pred.permute(1, 2, 0).detach().cpu().numpy() for batch in predicted_batches for pred in batch["out_map"] + ] + if return_maps: + return preds, seq_maps + return preds diff --git a/doctr/models/detection/predictor/tensorflow.py b/doctr/models/detection/predictor/tensorflow.py index 80251ff96e..6317a61874 100644 --- a/doctr/models/detection/predictor/tensorflow.py +++ b/doctr/models/detection/predictor/tensorflow.py @@ -3,7 +3,7 @@ # This program is licensed under the Apache License 2.0. # See LICENSE or go to for full license details. -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import tensorflow as tf @@ -37,14 +37,21 @@ def __init__( def __call__( self, pages: List[Union[np.ndarray, tf.Tensor]], + return_maps: bool = False, **kwargs: Any, - ) -> List[Dict[str, np.ndarray]]: + ) -> Union[List[Dict[str, np.ndarray]], Tuple[List[Dict[str, np.ndarray]], List[np.ndarray]]]: # Dimension check if any(page.ndim != 3 for page in pages): raise ValueError("incorrect input shape: all pages are expected to be multi-channel 2D images.") processed_batches = self.pre_processor(pages) predicted_batches = [ - self.model(batch, return_preds=True, training=False, **kwargs)["preds"] for batch in processed_batches + self.model(batch, return_preds=True, return_model_output=True, training=False, **kwargs) + for batch in processed_batches ] - return [pred for batch in predicted_batches for pred in batch] + + preds = [pred for batch in predicted_batches for pred in batch["preds"]] + seq_maps = [pred.numpy() for batch in predicted_batches for pred in batch["out_map"]] + if return_maps: + return preds, seq_maps + return preds diff --git a/doctr/models/kie_predictor/pytorch.py b/doctr/models/kie_predictor/pytorch.py index 520dcdaf0e..9d7c0ef088 100644 --- a/doctr/models/kie_predictor/pytorch.py +++ b/doctr/models/kie_predictor/pytorch.py @@ -72,22 +72,34 @@ def forward( origin_page_shapes = [page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:] for page in pages] + # Localize text elements + loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) + # Detect document rotation and rotate pages + seq_maps = [ + np.sum(np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0), axis=-1, keepdims=True).astype(np.uint8) + for out_map in out_maps + ] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(page) for page in pages] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seq_maps] orientations = [ - {"value": orientation_page, "confidence": 1.0} for orientation_page in origin_page_orientations + {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] else: orientations = None if self.straighten_pages: origin_page_orientations = ( - origin_page_orientations if self.detect_orientation else [estimate_orientation(page) for page in pages] + origin_page_orientations + if self.detect_orientation + else [estimate_orientation(seq_map) for seq_map in seq_maps] ) - pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)] + pages = [ + rotate_image(page, -angle, expand=True) # type: ignore[arg-type] + for page, angle in zip(pages, origin_page_orientations) + ] + # forward again to get predictions on straight pages + loc_preds = self.det_predictor(pages, **kwargs) - # Localize text elements - loc_preds = self.det_predictor(pages, **kwargs) dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds) # type: ignore[assignment] # Check whether crop mode should be switched to channels first channels_last = len(pages) == 0 or isinstance(pages[0], np.ndarray) diff --git a/doctr/models/kie_predictor/tensorflow.py b/doctr/models/kie_predictor/tensorflow.py index d6dca51520..a1c1f8d2c3 100644 --- a/doctr/models/kie_predictor/tensorflow.py +++ b/doctr/models/kie_predictor/tensorflow.py @@ -72,24 +72,32 @@ def __call__( origin_page_shapes = [page.shape[:2] for page in pages] + # Localize text elements + loc_preds, out_maps = self.det_predictor(pages, return_preds=True, **kwargs) + # Detect document rotation and rotate pages + seq_maps = [ + np.sum(np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0), axis=-1, keepdims=True).astype(np.uint8) + for out_map in out_maps + ] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(page) for page in pages] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seq_maps] orientations = [ - {"value": orientation_page, "confidence": 1.0} for orientation_page in origin_page_orientations + {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] else: orientations = None if self.straighten_pages: origin_page_orientations = ( - origin_page_orientations if self.detect_orientation else [estimate_orientation(page) for page in pages] + origin_page_orientations + if self.detect_orientation + else [estimate_orientation(seq_map) for seq_map in seq_maps] ) pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)] + # forward again to get predictions on straight pages + loc_preds = self.det_predictor(pages, **kwargs) # type: ignore[assignment] - # Localize text elements - loc_preds = self.det_predictor(pages, **kwargs) - - dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds) # type: ignore[assignment] + dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds) # type: ignore # Rectify crops if aspect ratio dict_loc_preds = {k: self._remove_padding(pages, loc_pred) for k, loc_pred in dict_loc_preds.items()} diff --git a/doctr/models/predictor/pytorch.py b/doctr/models/predictor/pytorch.py index 59b34c8dca..c16d404166 100644 --- a/doctr/models/predictor/pytorch.py +++ b/doctr/models/predictor/pytorch.py @@ -72,22 +72,31 @@ def forward( origin_page_shapes = [page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:] for page in pages] + # Localize text elements + loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) + # Detect document rotation and rotate pages + seq_maps = [np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0).astype(np.uint8) for out_map in out_maps] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(page) for page in pages] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seq_maps] orientations = [ - {"value": orientation_page, "confidence": 1.0} for orientation_page in origin_page_orientations + {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] else: orientations = None if self.straighten_pages: origin_page_orientations = ( - origin_page_orientations if self.detect_orientation else [estimate_orientation(page) for page in pages] + origin_page_orientations + if self.detect_orientation + else [estimate_orientation(seq_map) for seq_map in seq_maps] ) - pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)] + pages = [ + rotate_image(page, -angle, expand=True) # type: ignore[arg-type] + for page, angle in zip(pages, origin_page_orientations) + ] + # forward again to get predictions on straight pages + loc_preds = self.det_predictor(pages, **kwargs) - # Localize text elements - loc_preds = self.det_predictor(pages, **kwargs) assert all( len(loc_pred) == 1 for loc_pred in loc_preds ), "Detection Model in ocr_predictor should output only one class" diff --git a/doctr/models/predictor/tensorflow.py b/doctr/models/predictor/tensorflow.py index 9ac31b3957..a1d66cdb66 100644 --- a/doctr/models/predictor/tensorflow.py +++ b/doctr/models/predictor/tensorflow.py @@ -72,27 +72,34 @@ def __call__( origin_page_shapes = [page.shape[:2] for page in pages] + # Localize text elements + loc_preds_dict, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) + # Detect document rotation and rotate pages + seq_maps = [np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0).astype(np.uint8) for out_map in out_maps] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(page) for page in pages] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seq_maps] orientations = [ - {"value": orientation_page, "confidence": 1.0} for orientation_page in origin_page_orientations + {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] else: orientations = None if self.straighten_pages: origin_page_orientations = ( - origin_page_orientations if self.detect_orientation else [estimate_orientation(page) for page in pages] + origin_page_orientations + if self.detect_orientation + else [estimate_orientation(seq_map) for seq_map in seq_maps] ) pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)] + # forward again to get predictions on straight pages + loc_preds_dict = self.det_predictor(pages, **kwargs) # type: ignore[assignment] - # Localize text elements - loc_preds_dict = self.det_predictor(pages, **kwargs) assert all( len(loc_pred) == 1 for loc_pred in loc_preds_dict ), "Detection Model in ocr_predictor should output only one class" - - loc_preds: List[np.ndarray] = [list(loc_pred.values())[0] for loc_pred in loc_preds_dict] + loc_preds: List[np.ndarray] = [ + list(loc_pred.values())[0] for loc_pred in loc_preds_dict # type: ignore[union-attr] + ] # Rectify crops if aspect ratio loc_preds = self._remove_padding(pages, loc_preds) diff --git a/doctr/models/zoo.py b/doctr/models/zoo.py index c7842124e6..da806227c0 100644 --- a/doctr/models/zoo.py +++ b/doctr/models/zoo.py @@ -24,6 +24,7 @@ def _predictor( det_bs: int = 2, reco_bs: int = 128, detect_orientation: bool = False, + straighten_pages: bool = False, detect_language: bool = False, **kwargs, ) -> OCRPredictor: @@ -53,6 +54,7 @@ def _predictor( preserve_aspect_ratio=preserve_aspect_ratio, symmetric_pad=symmetric_pad, detect_orientation=detect_orientation, + straighten_pages=straighten_pages, detect_language=detect_language, **kwargs, ) @@ -68,6 +70,7 @@ def ocr_predictor( symmetric_pad: bool = True, export_as_straight_boxes: bool = False, detect_orientation: bool = False, + straighten_pages: bool = False, detect_language: bool = False, **kwargs: Any, ) -> OCRPredictor: @@ -96,6 +99,9 @@ def ocr_predictor( (potentially rotated) as straight bounding boxes. detect_orientation: if True, the estimated general page orientation will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation based on the median line orientation. + Then, rotates page before passing it to the deep learning modules. The final predictions will be remapped + accordingly. Doing so will improve performances for documents with page-uniform rotations. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. kwargs: keyword args of `OCRPredictor` @@ -114,6 +120,7 @@ def ocr_predictor( symmetric_pad=symmetric_pad, export_as_straight_boxes=export_as_straight_boxes, detect_orientation=detect_orientation, + straighten_pages=straighten_pages, detect_language=detect_language, **kwargs, ) @@ -130,6 +137,7 @@ def _kie_predictor( det_bs: int = 2, reco_bs: int = 128, detect_orientation: bool = False, + straighten_pages: bool = False, detect_language: bool = False, **kwargs, ) -> KIEPredictor: @@ -159,6 +167,7 @@ def _kie_predictor( preserve_aspect_ratio=preserve_aspect_ratio, symmetric_pad=symmetric_pad, detect_orientation=detect_orientation, + straighten_pages=straighten_pages, detect_language=detect_language, **kwargs, ) @@ -174,6 +183,7 @@ def kie_predictor( symmetric_pad: bool = True, export_as_straight_boxes: bool = False, detect_orientation: bool = False, + straighten_pages: bool = False, detect_language: bool = False, **kwargs: Any, ) -> KIEPredictor: @@ -202,6 +212,9 @@ def kie_predictor( (potentially rotated) as straight bounding boxes. detect_orientation: if True, the estimated general page orientation will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation based on the median line orientation. + Then, rotates page before passing it to the deep learning modules. The final predictions will be remapped + accordingly. Doing so will improve performances for documents with page-uniform rotations. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. kwargs: keyword args of `OCRPredictor` @@ -220,6 +233,7 @@ def kie_predictor( symmetric_pad=symmetric_pad, export_as_straight_boxes=export_as_straight_boxes, detect_orientation=detect_orientation, + straighten_pages=straighten_pages, detect_language=detect_language, **kwargs, ) diff --git a/tests/common/test_models.py b/tests/common/test_models.py index fea26024b1..ab1dc35323 100644 --- a/tests/common/test_models.py +++ b/tests/common/test_models.py @@ -23,7 +23,7 @@ def mock_image(tmpdir_factory): @pytest.fixture(scope="function") def mock_bitmap(mock_image): - bitmap = np.squeeze(cv2.cvtColor(mock_image, cv2.COLOR_BGR2GRAY) / 255.0) + bitmap = np.squeeze(cv2.cvtColor(mock_image, cv2.COLOR_BGR2GRAY)) return bitmap @@ -32,20 +32,13 @@ def test_get_bitmap_angle(mock_bitmap): assert abs(angle - 30.0) < 1.0 -def test_estimate_orientation(mock_image, mock_tilted_payslip): - assert estimate_orientation(mock_image * 0) == 0 +def test_estimate_orientation(mock_bitmap): + assert estimate_orientation(mock_bitmap * 0) == 0 - angle = estimate_orientation(mock_image) + angle = estimate_orientation(mock_bitmap) assert abs(angle - 30.0) < 1.0 - rotated = geometry.rotate_image(mock_image, -angle) - angle_rotated = estimate_orientation(rotated) - assert abs(angle_rotated) < 1.0 - - mock_tilted_payslip = reader.read_img_as_numpy(mock_tilted_payslip) - assert (estimate_orientation(mock_tilted_payslip) - 30.0) < 1.0 - - rotated = geometry.rotate_image(mock_tilted_payslip, -30, expand=True) + rotated = geometry.rotate_image(mock_bitmap, -angle) angle_rotated = estimate_orientation(rotated) assert abs(angle_rotated) < 1.0 From eeb63eca6c18e3457950da68881508eda7f4abf8 Mon Sep 17 00:00:00 2001 From: felix Date: Wed, 4 Oct 2023 11:03:10 +0200 Subject: [PATCH 02/13] [skip ci] some updates and fixes --- README.md | 2 +- doctr/io/elements.py | 35 +++++++++++------------- doctr/models/builder.py | 16 +++++++---- doctr/models/kie_predictor/pytorch.py | 29 ++++++-------------- doctr/models/kie_predictor/tensorflow.py | 29 ++++++-------------- doctr/models/predictor/pytorch.py | 20 ++++---------- doctr/models/predictor/tensorflow.py | 16 ++--------- tests/common/test_io_elements.py | 20 ++++++++++---- tests/common/test_models_builder.py | 22 +++++++++------ 9 files changed, 79 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index 242cff4d59..93cc877b70 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ If both options are set to False, the predictor will always fit and return rotat To interpret your model's predictions, you can visualize them interactively as follows: ```python -result.show(doc) +result.show() ``` ![Visualization sample](docs/images/doctr_example_script.gif) diff --git a/doctr/io/elements.py b/doctr/io/elements.py index 4e92a4043f..d94ade587e 100644 --- a/doctr/io/elements.py +++ b/doctr/io/elements.py @@ -234,7 +234,7 @@ class Page(Element): """Implements a page element as a collection of blocks Args: - ---- + page: image encoded as a numpy array in uint8 blocks: list of block elements page_idx: the index of the page in the input raw document dimensions: the page size in pixels in format (height, width) @@ -248,6 +248,7 @@ class Page(Element): def __init__( self, + page: np.ndarray, blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], @@ -255,6 +256,7 @@ def __init__( language: Optional[Dict[str, Any]] = None, ) -> None: super().__init__(blocks=blocks) + self.page = page self.page_idx = page_idx self.dimensions = dimensions self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) @@ -267,17 +269,15 @@ def render(self, block_break: str = "\n\n") -> str: def extra_repr(self) -> str: return f"dimensions={self.dimensions}" - def show(self, page: np.ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None: + def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None: """Overlay the result on a given image Args: - ---- - page: image encoded as a numpy array in uint8 interactive: whether the display should be interactive preserve_aspect_ratio: pass True if you passed True to the predictor **kwargs: additional keyword arguments passed to the matplotlib.pyplot.show method """ - visualize_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio) + visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio) plt.show(**kwargs) def synthesize(self, **kwargs) -> np.ndarray: @@ -408,6 +408,7 @@ class KIEPage(Element): Args: ---- predictions: Dictionary with list of block elements for each detection class + page: image encoded as a numpy array in uint8 page_idx: the index of the page in the input raw document dimensions: the page size in pixels in format (height, width) orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction @@ -420,6 +421,7 @@ class KIEPage(Element): def __init__( self, + page: np.ndarray, predictions: Dict[str, List[Prediction]], page_idx: int, dimensions: Tuple[int, int], @@ -427,6 +429,7 @@ def __init__( language: Optional[Dict[str, Any]] = None, ) -> None: super().__init__(predictions=predictions) + self.page = page self.page_idx = page_idx self.dimensions = dimensions self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) @@ -441,17 +444,17 @@ def render(self, prediction_break: str = "\n\n") -> str: def extra_repr(self) -> str: return f"dimensions={self.dimensions}" - def show(self, page: np.ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None: + def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None: """Overlay the result on a given image Args: - ---- - page: image encoded as a numpy array in uint8 interactive: whether the display should be interactive preserve_aspect_ratio: pass True if you passed True to the predictor **kwargs: keyword arguments passed to the matplotlib.pyplot.show method """ - visualize_kie_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio) + visualize_kie_page( + self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio + ) plt.show(**kwargs) def synthesize(self, **kwargs) -> np.ndarray: @@ -561,16 +564,10 @@ def render(self, page_break: str = "\n\n\n\n") -> str: """Renders the full text of the element""" return page_break.join(p.render() for p in self.pages) - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - ---- - pages: list of images encoded as numpy arrays in uint8 - **kwargs: keyword arguments passed to the Page.show method - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs) + def show(self, **kwargs) -> None: + """Overlay the result on a given image""" + for result in self.pages: + result.show(**kwargs) def synthesize(self, **kwargs) -> List[np.ndarray]: """Synthesize all pages from their predictions diff --git a/doctr/models/builder.py b/doctr/models/builder.py index 820689bbac..b974f7c0db 100644 --- a/doctr/models/builder.py +++ b/doctr/models/builder.py @@ -287,6 +287,7 @@ def extra_repr(self) -> str: def __call__( self, + pages: List[np.ndarray], boxes: List[np.ndarray], text_preds: List[List[Tuple[str, float]]], page_shapes: List[Tuple[int, int]], @@ -296,7 +297,7 @@ def __call__( """Re-arrange detected words into structured blocks Args: - ---- + pages: list of N elements, where each element represents the page image boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5) or (*, 6) for all words for a given page text_preds: list of N elements, where each element is the list of all word prediction (text + confidence) @@ -325,6 +326,7 @@ def __call__( _pages = [ Page( + page, self._build_blocks( page_boxes, word_preds, @@ -334,8 +336,8 @@ def __call__( orientation, language, ) - for _idx, shape, page_boxes, word_preds, orientation, language in zip( - range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages + for page, _idx, shape, page_boxes, word_preds, orientation, language in zip( + pages, range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages ) ] @@ -356,6 +358,7 @@ class KIEDocumentBuilder(DocumentBuilder): def __call__( # type: ignore[override] self, + pages: List[np.ndarray], boxes: List[Dict[str, np.ndarray]], text_preds: List[Dict[str, List[Tuple[str, float]]]], page_shapes: List[Tuple[int, int]], @@ -365,7 +368,7 @@ def __call__( # type: ignore[override] """Re-arrange detected words into structured predictions Args: - ---- + pages: list of N elements, where each element represents the page image boxes: list of N dictionaries, where each element represents the localization predictions for a class, of shape (*, 5) or (*, 6) for all predictions text_preds: list of N dictionaries, where each element is the list of all word prediction @@ -400,6 +403,7 @@ def __call__( # type: ignore[override] _pages = [ KIEPage( + page, { k: self._build_blocks( page_boxes[k], @@ -412,8 +416,8 @@ def __call__( # type: ignore[override] orientation, language, ) - for _idx, shape, page_boxes, word_preds, orientation, language in zip( - range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages + for page, _idx, shape, page_boxes, word_preds, orientation, language in zip( + pages, range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages ) ] diff --git a/doctr/models/kie_predictor/pytorch.py b/doctr/models/kie_predictor/pytorch.py index 9d7c0ef088..957a7270b3 100644 --- a/doctr/models/kie_predictor/pytorch.py +++ b/doctr/models/kie_predictor/pytorch.py @@ -13,7 +13,7 @@ from doctr.models._utils import estimate_orientation, get_language, invert_data_structure from doctr.models.detection.predictor import DetectionPredictor from doctr.models.recognition.predictor import RecognitionPredictor -from doctr.utils.geometry import rotate_boxes, rotate_image +from doctr.utils.geometry import rotate_image from .base import _KIEPredictor @@ -77,7 +77,9 @@ def forward( # Detect document rotation and rotate pages seq_maps = [ - np.sum(np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0), axis=-1, keepdims=True).astype(np.uint8) + np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype( + np.uint8 + ) for out_map in out_maps ] if self.detect_orientation: @@ -94,10 +96,10 @@ def forward( else [estimate_orientation(seq_map) for seq_map in seq_maps] ) pages = [ - rotate_image(page, -angle, expand=True) # type: ignore[arg-type] + rotate_image(page, -angle, expand=False) # type: ignore[arg-type] for page, angle in zip(pages, origin_page_orientations) ] - # forward again to get predictions on straight pages + # Forward again to get predictions on straight pages loc_preds = self.det_predictor(pages, **kwargs) dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds) # type: ignore[assignment] @@ -142,27 +144,12 @@ def forward( languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages] else: languages_dict = None - # Rotate back pages and boxes while keeping original image size - if self.straighten_pages: - boxes_per_page = [ - { - k: rotate_boxes( - page_boxes, - angle, - orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[1:], - target_shape=mask, - ) - for k, page_boxes in page_boxes_dict.items() - } - for page_boxes_dict, page, angle, mask in zip( - boxes_per_page, pages, origin_page_orientations, origin_page_shapes - ) - ] out = self.doc_builder( + pages, # type: ignore[arg-type] boxes_per_page, text_preds_per_page, - [page.shape[:2] if channels_last else page.shape[-2:] for page in pages], # type: ignore[misc] + origin_page_shapes, # type: ignore[arg-type] orientations, languages_dict, ) diff --git a/doctr/models/kie_predictor/tensorflow.py b/doctr/models/kie_predictor/tensorflow.py index a1c1f8d2c3..26c22c66c2 100644 --- a/doctr/models/kie_predictor/tensorflow.py +++ b/doctr/models/kie_predictor/tensorflow.py @@ -12,7 +12,7 @@ from doctr.models._utils import estimate_orientation, get_language, invert_data_structure from doctr.models.detection.predictor import DetectionPredictor from doctr.models.recognition.predictor import RecognitionPredictor -from doctr.utils.geometry import rotate_boxes, rotate_image +from doctr.utils.geometry import rotate_image from doctr.utils.repr import NestedObject from .base import _KIEPredictor @@ -73,11 +73,13 @@ def __call__( origin_page_shapes = [page.shape[:2] for page in pages] # Localize text elements - loc_preds, out_maps = self.det_predictor(pages, return_preds=True, **kwargs) + loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) # Detect document rotation and rotate pages seq_maps = [ - np.sum(np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0), axis=-1, keepdims=True).astype(np.uint8) + np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype( + np.uint8 + ) for out_map in out_maps ] if self.detect_orientation: @@ -93,8 +95,8 @@ def __call__( if self.detect_orientation else [estimate_orientation(seq_map) for seq_map in seq_maps] ) - pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)] - # forward again to get predictions on straight pages + pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] + # Forward again to get predictions on straight pages loc_preds = self.det_predictor(pages, **kwargs) # type: ignore[assignment] dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds) # type: ignore @@ -135,24 +137,9 @@ def __call__( languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages] else: languages_dict = None - # Rotate back pages and boxes while keeping original image size - if self.straighten_pages: - boxes_per_page = [ - { - k: rotate_boxes( - page_boxes, - angle, - orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:], - target_shape=mask, # type: ignore[arg-type] - ) - for k, page_boxes in page_boxes_dict.items() - } - for page_boxes_dict, page, angle, mask in zip( - boxes_per_page, pages, origin_page_orientations, origin_page_shapes - ) - ] out = self.doc_builder( + pages, boxes_per_page, text_preds_per_page, origin_page_shapes, # type: ignore[arg-type] diff --git a/doctr/models/predictor/pytorch.py b/doctr/models/predictor/pytorch.py index c16d404166..a55e4c3cc4 100644 --- a/doctr/models/predictor/pytorch.py +++ b/doctr/models/predictor/pytorch.py @@ -13,7 +13,7 @@ from doctr.models._utils import estimate_orientation, get_language from doctr.models.detection.predictor import DetectionPredictor from doctr.models.recognition.predictor import RecognitionPredictor -from doctr.utils.geometry import rotate_boxes, rotate_image +from doctr.utils.geometry import rotate_image from .base import _OCRPredictor @@ -91,10 +91,10 @@ def forward( else [estimate_orientation(seq_map) for seq_map in seq_maps] ) pages = [ - rotate_image(page, -angle, expand=True) # type: ignore[arg-type] + rotate_image(page, -angle, expand=False) # type: ignore[arg-type] for page, angle in zip(pages, origin_page_orientations) ] - # forward again to get predictions on straight pages + # Forward again to get predictions on straight pages loc_preds = self.det_predictor(pages, **kwargs) assert all( @@ -128,22 +128,12 @@ def forward( languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages] else: languages_dict = None - # Rotate back pages and boxes while keeping original image size - if self.straighten_pages: - boxes = [ - rotate_boxes( - page_boxes, - angle, - orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[1:], - target_shape=mask, - ) - for page_boxes, page, angle, mask in zip(boxes, pages, origin_page_orientations, origin_page_shapes) - ] out = self.doc_builder( + pages, # type: ignore[arg-type] boxes, text_preds, - [page.shape[:2] if channels_last else page.shape[-2:] for page in pages], # type: ignore[misc] + origin_page_shapes, # type: ignore[arg-type] orientations, languages_dict, ) diff --git a/doctr/models/predictor/tensorflow.py b/doctr/models/predictor/tensorflow.py index a1d66cdb66..14a80ecfd8 100644 --- a/doctr/models/predictor/tensorflow.py +++ b/doctr/models/predictor/tensorflow.py @@ -12,7 +12,7 @@ from doctr.models._utils import estimate_orientation, get_language from doctr.models.detection.predictor import DetectionPredictor from doctr.models.recognition.predictor import RecognitionPredictor -from doctr.utils.geometry import rotate_boxes, rotate_image +from doctr.utils.geometry import rotate_image from doctr.utils.repr import NestedObject from .base import _OCRPredictor @@ -90,7 +90,7 @@ def __call__( if self.detect_orientation else [estimate_orientation(seq_map) for seq_map in seq_maps] ) - pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)] + pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] # forward again to get predictions on straight pages loc_preds_dict = self.det_predictor(pages, **kwargs) # type: ignore[assignment] @@ -122,19 +122,9 @@ def __call__( languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages] else: languages_dict = None - # Rotate back pages and boxes while keeping original image size - if self.straighten_pages: - boxes = [ - rotate_boxes( - page_boxes, - angle, - orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:], - target_shape=mask, # type: ignore[arg-type] - ) - for page_boxes, page, angle, mask in zip(boxes, pages, origin_page_orientations, origin_page_shapes) - ] out = self.doc_builder( + pages, boxes, text_preds, origin_page_shapes, # type: ignore[arg-type] diff --git a/tests/common/test_io_elements.py b/tests/common/test_io_elements.py index 965033290a..af982c6e04 100644 --- a/tests/common/test_io_elements.py +++ b/tests/common/test_io_elements.py @@ -72,6 +72,7 @@ def _mock_blocks(size=(1, 1), offset=(0, 0)): def _mock_pages(block_size=(1, 1), block_offset=(0, 0)): return [ elements.Page( + np.random.randint(0, 255, (300, 200, 3), dtype=np.uint8), _mock_blocks(block_size, block_offset), 0, (300, 200), @@ -79,6 +80,7 @@ def _mock_pages(block_size=(1, 1), block_offset=(0, 0)): {"value": "EN", "confidence": 0.8}, ), elements.Page( + np.random.randint(0, 255, (500, 1000, 3), dtype=np.uint8), _mock_blocks(block_size, block_offset), 1, (500, 1000), @@ -91,6 +93,7 @@ def _mock_pages(block_size=(1, 1), block_offset=(0, 0)): def _mock_kie_pages(prediction_size=(1, 1), prediction_offset=(0, 0)): return [ elements.KIEPage( + np.random.randint(0, 255, (300, 200, 3), dtype=np.uint8), {CLASS_NAME: _mock_prediction(prediction_size, prediction_offset)}, 0, (300, 200), @@ -98,6 +101,7 @@ def _mock_kie_pages(prediction_size=(1, 1), prediction_offset=(0, 0)): {"value": "EN", "confidence": 0.8}, ), elements.KIEPage( + np.random.randint(0, 255, (500, 1000, 3), dtype=np.uint8), {CLASS_NAME: _mock_prediction(prediction_size, prediction_offset)}, 1, (500, 1000), @@ -243,16 +247,18 @@ def test_block(): def test_page(): + page = np.zeros((300, 200, 3), dtype=np.uint8) page_idx = 0 page_size = (300, 200) orientation = {"value": 0.0, "confidence": 0.0} language = {"value": "EN", "confidence": 0.8} blocks = _mock_blocks() - page = elements.Page(blocks, page_idx, page_size, orientation, language) + page = elements.Page(page, blocks, page_idx, page_size, orientation, language) # Attribute checks assert len(page.blocks) == len(blocks) assert all(isinstance(b, elements.Block) for b in page.blocks) + assert isinstance(page.page, np.ndarray) assert page.page_idx == page_idx assert page.dimensions == page_size assert page.orientation == orientation @@ -281,7 +287,7 @@ def test_page(): assert "\n".join(repr(page).split("\n")[:2]) == f"Page(\n dimensions={page_size!r}" # Show - page.show(np.zeros((256, 256, 3), dtype=np.uint8), block=False) + page.show(block=False) # Synthesize img = page.synthesize() @@ -290,16 +296,18 @@ def test_page(): def test_kiepage(): + page = np.zeros((300, 200, 3), dtype=np.uint8) page_idx = 0 page_size = (300, 200) orientation = {"value": 0.0, "confidence": 0.0} language = {"value": "EN", "confidence": 0.8} predictions = {CLASS_NAME: _mock_prediction()} - kie_page = elements.KIEPage(predictions, page_idx, page_size, orientation, language) + kie_page = elements.KIEPage(page, predictions, page_idx, page_size, orientation, language) # Attribute checks assert len(kie_page.predictions) == len(predictions) assert all(isinstance(b, elements.Prediction) for b in kie_page.predictions[CLASS_NAME]) + assert isinstance(kie_page.page, np.ndarray) assert kie_page.page_idx == page_idx assert kie_page.dimensions == page_size assert kie_page.orientation == orientation @@ -328,7 +336,7 @@ def test_kiepage(): assert "\n".join(repr(kie_page).split("\n")[:2]) == f"KIEPage(\n dimensions={page_size!r}" # Show - kie_page.show(np.zeros((256, 256, 3), dtype=np.uint8), block=False) + kie_page.show(block=False) # Synthesize img = kie_page.synthesize() @@ -355,7 +363,7 @@ def test_document(): assert isinstance(doc.export_as_xml(), list) and len(doc.export_as_xml()) == len(pages) # Show - doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False) + doc.show(block=False) # Synthesize img_list = doc.synthesize() @@ -381,7 +389,7 @@ def test_kie_document(): assert isinstance(doc.export_as_xml(), list) and len(doc.export_as_xml()) == len(pages) # Show - doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False) + doc.show(block=False) # Synthesize img_list = doc.synthesize() diff --git a/tests/common/test_models_builder.py b/tests/common/test_models_builder.py index 7940bf8a5d..90c681b5f6 100644 --- a/tests/common/test_models_builder.py +++ b/tests/common/test_models_builder.py @@ -20,12 +20,13 @@ def test_documentbuilder(): # Don't resolve lines doc_builder = builder.DocumentBuilder(resolve_lines=False, resolve_blocks=False) + pages = [np.zeros((100, 200, 3))] * num_pages boxes = np.random.rand(words_per_page, 6) # array format boxes[:2] *= boxes[2:4] # Arg consistency check with pytest.raises(ValueError): - doc_builder([boxes, boxes], [("hello", 1.0)] * 3, [(100, 200), (100, 200)]) - out = doc_builder([boxes, boxes], [[("hello", 1.0)] * words_per_page] * num_pages, [(100, 200), (100, 200)]) + doc_builder(pages, [boxes, boxes], [("hello", 1.0)] * 3, [(100, 200), (100, 200)]) + out = doc_builder(pages, [boxes, boxes], [[("hello", 1.0)] * words_per_page] * num_pages, [(100, 200), (100, 200)]) assert isinstance(out, Document) assert len(out.pages) == num_pages # 1 Block & 1 line per page @@ -34,11 +35,11 @@ def test_documentbuilder(): # Resolve lines doc_builder = builder.DocumentBuilder(resolve_lines=True, resolve_blocks=True) - out = doc_builder([boxes, boxes], [[("hello", 1.0)] * words_per_page] * num_pages, [(100, 200), (100, 200)]) + out = doc_builder(pages, [boxes, boxes], [[("hello", 1.0)] * words_per_page] * num_pages, [(100, 200), (100, 200)]) # No detection boxes = np.zeros((0, 5)) - out = doc_builder([boxes, boxes], [[], []], [(100, 200), (100, 200)]) + out = doc_builder(pages, [boxes, boxes], [[], []], [(100, 200), (100, 200)]) assert len(out.pages[0].blocks) == 0 # Rotated boxes to export as straight boxes @@ -49,7 +50,7 @@ def test_documentbuilder(): ] ) doc_builder_2 = builder.DocumentBuilder(resolve_blocks=False, resolve_lines=False, export_as_straight_boxes=True) - out = doc_builder_2([boxes], [[("hello", 0.99), ("word", 0.99)]], [(100, 100)]) + out = doc_builder_2([np.zeros((100, 100, 3))], [boxes], [[("hello", 0.99), ("word", 0.99)]], [(100, 100)]) assert out.pages[0].blocks[0].lines[0].words[-1].geometry == ((0.45, 0.5), (0.6, 0.65)) # Repr @@ -64,12 +65,14 @@ def test_kiedocumentbuilder(): # Don't resolve lines doc_builder = builder.KIEDocumentBuilder(resolve_lines=False, resolve_blocks=False) + pages = [np.zeros((100, 200, 3))] * num_pages predictions = {CLASS_NAME: np.random.rand(words_per_page, 6)} # dict format predictions[CLASS_NAME][:2] *= predictions[CLASS_NAME][2:4] # Arg consistency check with pytest.raises(ValueError): - doc_builder([predictions, predictions], [{CLASS_NAME: ("hello", 1.0)}] * 3, [(100, 200), (100, 200)]) + doc_builder(pages, [predictions, predictions], [{CLASS_NAME: ("hello", 1.0)}] * 3, [(100, 200), (100, 200)]) out = doc_builder( + pages, [predictions, predictions], [{CLASS_NAME: [("hello", 1.0)] * words_per_page}] * num_pages, [(100, 200), (100, 200)], @@ -83,6 +86,7 @@ def test_kiedocumentbuilder(): # Resolve lines doc_builder = builder.KIEDocumentBuilder(resolve_lines=True, resolve_blocks=True) out = doc_builder( + pages, [predictions, predictions], [{CLASS_NAME: [("hello", 1.0)] * words_per_page}] * num_pages, [(100, 200), (100, 200)], @@ -90,7 +94,7 @@ def test_kiedocumentbuilder(): # No detection predictions = {CLASS_NAME: np.zeros((0, 5))} - out = doc_builder([predictions, predictions], [{CLASS_NAME: []}, {CLASS_NAME: []}], [(100, 200), (100, 200)]) + out = doc_builder(pages, [predictions, predictions], [{CLASS_NAME: []}, {CLASS_NAME: []}], [(100, 200), (100, 200)]) assert len(out.pages[0].predictions[CLASS_NAME]) == 0 # Rotated boxes to export as straight boxes @@ -103,7 +107,9 @@ def test_kiedocumentbuilder(): ) } doc_builder_2 = builder.KIEDocumentBuilder(resolve_blocks=False, resolve_lines=False, export_as_straight_boxes=True) - out = doc_builder_2([predictions], [{CLASS_NAME: [("hello", 0.99), ("word", 0.99)]}], [(100, 100)]) + out = doc_builder_2( + [np.zeros((100, 100, 3))], [predictions], [{CLASS_NAME: [("hello", 0.99), ("word", 0.99)]}], [(100, 100)] + ) assert out.pages[0].predictions[CLASS_NAME][0].geometry == ((0.05, 0.1), (0.2, 0.25)) assert out.pages[0].predictions[CLASS_NAME][1].geometry == ((0.45, 0.5), (0.6, 0.65)) From a21c4afb307d585aae9675f0185efde3960c2b9d Mon Sep 17 00:00:00 2001 From: felix Date: Wed, 4 Oct 2023 16:26:59 +0200 Subject: [PATCH 03/13] update tests --- tests/pytorch/test_models_zoo_pt.py | 53 ++++++++++++++------------ tests/tensorflow/test_models_zoo_tf.py | 52 ++++++++++++++----------- 2 files changed, 58 insertions(+), 47 deletions(-) diff --git a/tests/pytorch/test_models_zoo_pt.py b/tests/pytorch/test_models_zoo_pt.py index cefb77176f..fa3f23b9d1 100644 --- a/tests/pytorch/test_models_zoo_pt.py +++ b/tests/pytorch/test_models_zoo_pt.py @@ -73,10 +73,17 @@ def test_ocrpredictor(mock_pdf, mock_vocab, assume_straight_pages, straighten_pa assert out.pages[0].orientation["value"] == orientation -def test_trained_ocr_predictor(mock_tilted_payslip): - doc = DocumentFile.from_images(mock_tilted_payslip) +def test_trained_ocr_predictor(mock_payslip): + doc = DocumentFile.from_images(mock_payslip) - det_predictor = detection_predictor("db_resnet50", pretrained=True, batch_size=2, assume_straight_pages=True) + det_predictor = detection_predictor( + "db_resnet50", + pretrained=True, + batch_size=2, + assume_straight_pages=True, + symmetric_pad=True, + preserve_aspect_ratio=False, + ) reco_predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=128) predictor = OCRPredictor( @@ -90,16 +97,12 @@ def test_trained_ocr_predictor(mock_tilted_payslip): out = predictor(doc) assert out.pages[0].blocks[0].lines[0].words[0].value == "Mr." - geometry_mr = np.array( - [[0.08563021, 0.35584526], [0.11464554, 0.34078913], [0.1274898, 0.36012764], [0.09847447, 0.37518377]] - ) - assert np.allclose(np.array(out.pages[0].blocks[0].lines[0].words[0].geometry), geometry_mr) + geometry_mr = np.array([[0.1083984375, 0.0634765625], [0.1494140625, 0.0859375]]) + assert np.allclose(np.array(out.pages[0].blocks[0].lines[0].words[0].geometry), geometry_mr, rtol=0.05) assert out.pages[0].blocks[1].lines[0].words[-1].value == "revised" - geometry_revised = np.array( - [[0.50422498, 0.19551784], [0.55741975, 0.16791493], [0.56705294, 0.18241881], [0.51385817, 0.21002172]] - ) - assert np.allclose(np.array(out.pages[0].blocks[1].lines[0].words[-1].geometry), geometry_revised) + geometry_revised = np.array([[0.7548828125, 0.126953125], [0.8388671875, 0.1484375]]) + assert np.allclose(np.array(out.pages[0].blocks[1].lines[0].words[-1].geometry), geometry_revised, rtol=0.05) det_predictor = detection_predictor( "db_resnet50", @@ -181,10 +184,17 @@ def test_kiepredictor(mock_pdf, mock_vocab, assume_straight_pages, straighten_pa assert out.pages[0].orientation["value"] == orientation -def test_trained_kie_predictor(mock_tilted_payslip): - doc = DocumentFile.from_images(mock_tilted_payslip) +def test_trained_kie_predictor(mock_payslip): + doc = DocumentFile.from_images(mock_payslip) - det_predictor = detection_predictor("db_resnet50", pretrained=True, batch_size=2, assume_straight_pages=True) + det_predictor = detection_predictor( + "db_resnet50", + pretrained=True, + batch_size=2, + assume_straight_pages=True, + symmetric_pad=True, + preserve_aspect_ratio=False, + ) reco_predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=128) predictor = KIEPredictor( @@ -199,17 +209,12 @@ def test_trained_kie_predictor(mock_tilted_payslip): assert isinstance(out, KIEDocument) assert out.pages[0].predictions[CLASS_NAME][0].value == "Mr." - geometry_mr = np.array( - [[0.08563021, 0.35584526], [0.11464554, 0.34078913], [0.1274898, 0.36012764], [0.09847447, 0.37518377]] - ) - assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][0].geometry), geometry_mr) + geometry_mr = np.array([[0.1083984375, 0.0634765625], [0.1494140625, 0.0859375]]) + assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][0].geometry), geometry_mr, rtol=0.05) - print(out.pages[0].predictions[CLASS_NAME]) - assert out.pages[0].predictions[CLASS_NAME][7].value == "revised" - geometry_revised = np.array( - [[0.50422498, 0.19551784], [0.55741975, 0.16791493], [0.56705294, 0.18241881], [0.51385817, 0.21002172]] - ) - assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][7].geometry), geometry_revised) + assert out.pages[0].predictions[CLASS_NAME][6].value == "revised" + geometry_revised = np.array([[0.7548828125, 0.126953125], [0.8388671875, 0.1484375]]) + assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][6].geometry), geometry_revised, rtol=0.05) det_predictor = detection_predictor( "db_resnet50", diff --git a/tests/tensorflow/test_models_zoo_tf.py b/tests/tensorflow/test_models_zoo_tf.py index 6d4b85e2c8..32e7988560 100644 --- a/tests/tensorflow/test_models_zoo_tf.py +++ b/tests/tensorflow/test_models_zoo_tf.py @@ -72,10 +72,17 @@ def test_ocrpredictor(mock_pdf, mock_vocab, assume_straight_pages, straighten_pa assert out.pages[0].language["value"] == language -def test_trained_ocr_predictor(mock_tilted_payslip): - doc = DocumentFile.from_images(mock_tilted_payslip) +def test_trained_ocr_predictor(mock_payslip): + doc = DocumentFile.from_images(mock_payslip) - det_predictor = detection_predictor("db_resnet50", pretrained=True, batch_size=2, assume_straight_pages=True) + det_predictor = detection_predictor( + "db_resnet50", + pretrained=True, + batch_size=2, + assume_straight_pages=True, + symmetric_pad=True, + preserve_aspect_ratio=False, + ) reco_predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=128) predictor = OCRPredictor( @@ -89,16 +96,12 @@ def test_trained_ocr_predictor(mock_tilted_payslip): out = predictor(doc) assert out.pages[0].blocks[0].lines[0].words[0].value == "Mr." - geometry_mr = np.array( - [[0.08844472, 0.35763523], [0.11625107, 0.34320644], [0.12588427, 0.35771032], [0.09807791, 0.37213911]] - ) - assert np.allclose(np.array(out.pages[0].blocks[0].lines[0].words[0].geometry), geometry_mr) + geometry_mr = np.array([[0.1083984375, 0.0634765625], [0.1494140625, 0.0859375]]) + assert np.allclose(np.array(out.pages[0].blocks[0].lines[0].words[0].geometry), geometry_mr, rtol=0.05) assert out.pages[0].blocks[1].lines[0].words[-1].value == "revised" - geometry_revised = np.array( - [[0.50422498, 0.19551784], [0.55741975, 0.16791493], [0.56705294, 0.18241881], [0.51385817, 0.21002172]] - ) - assert np.allclose(np.array(out.pages[0].blocks[1].lines[0].words[-1].geometry), geometry_revised) + geometry_revised = np.array([[0.7548828125, 0.126953125], [0.8388671875, 0.1484375]]) + assert np.allclose(np.array(out.pages[0].blocks[1].lines[0].words[-1].geometry), geometry_revised, rtol=0.05) det_predictor = detection_predictor( "db_resnet50", @@ -179,10 +182,17 @@ def test_kiepredictor(mock_pdf, mock_vocab, assume_straight_pages, straighten_pa assert out.pages[0].language["value"] == language -def test_trained_kie_predictor(mock_tilted_payslip): - doc = DocumentFile.from_images(mock_tilted_payslip) +def test_trained_kie_predictor(mock_payslip): + doc = DocumentFile.from_images(mock_payslip) - det_predictor = detection_predictor("db_resnet50", pretrained=True, batch_size=2, assume_straight_pages=True) + det_predictor = detection_predictor( + "db_resnet50", + pretrained=True, + batch_size=2, + assume_straight_pages=True, + symmetric_pad=True, + preserve_aspect_ratio=False, + ) reco_predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=128) predictor = KIEPredictor( @@ -197,16 +207,12 @@ def test_trained_kie_predictor(mock_tilted_payslip): assert isinstance(out, KIEDocument) assert out.pages[0].predictions[CLASS_NAME][0].value == "Mr." - geometry_mr = np.array( - [[0.08844472, 0.35763523], [0.11625107, 0.34320644], [0.12588427, 0.35771032], [0.09807791, 0.37213911]] - ) - assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][0].geometry), geometry_mr) + geometry_mr = np.array([[0.1083984375, 0.0634765625], [0.1494140625, 0.0859375]]) + assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][0].geometry), geometry_mr, rtol=0.05) - assert out.pages[0].predictions[CLASS_NAME][-1].value == "Kabir)" - geometry_revised = np.array( - [[0.43725992, 0.67232439], [0.49045468, 0.64472149], [0.50570724, 0.66768597], [0.452512473, 0.69528887]] - ) - assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][-1].geometry), geometry_revised) + assert out.pages[0].predictions[CLASS_NAME][3].value == "revised" + geometry_revised = np.array([[0.7548828125, 0.126953125], [0.8388671875, 0.1484375]]) + assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][3].geometry), geometry_revised, rtol=0.05) det_predictor = detection_predictor( "db_resnet50", From 6183a44c074a81b5220f1c211dd2509c14e51e34 Mon Sep 17 00:00:00 2001 From: felix Date: Thu, 5 Oct 2023 09:26:01 +0200 Subject: [PATCH 04/13] update some parts --- doctr/models/_utils.py | 57 ++++--------------- doctr/models/detection/predictor/pytorch.py | 4 +- .../models/detection/predictor/tensorflow.py | 4 +- doctr/models/kie_predictor/pytorch.py | 6 +- doctr/models/kie_predictor/tensorflow.py | 6 +- doctr/models/predictor/pytorch.py | 6 +- doctr/models/predictor/tensorflow.py | 6 +- scripts/analyze.py | 4 +- tests/common/test_models.py | 28 ++++++--- tests/pytorch/test_models_detection_pt.py | 6 +- tests/tensorflow/test_models_detection_tf.py | 6 +- 11 files changed, 59 insertions(+), 74 deletions(-) diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py index 304fa9a7e8..2b189f216a 100644 --- a/doctr/models/_utils.py +++ b/doctr/models/_utils.py @@ -11,7 +11,7 @@ import numpy as np from langdetect import LangDetectException, detect_langs -__all__ = ["estimate_orientation", "get_bitmap_angle", "get_language", "invert_data_structure"] +__all__ = ["estimate_orientation", "get_language", "invert_data_structure"] def get_max_width_length_ratio(contour: np.ndarray) -> float: @@ -27,12 +27,12 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float: return max(w / h, h / w) -def estimate_orientation(seq_map: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> float: +def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> float: """Estimate the angle of the general document orientation based on the lines of the document and the assumption that they should be horizontal. Args: - seq_map: the binarized image of the document + img: the img or bitmap to analyze (H, W, C) n_ct: the number of contours used for the orientation estimation ratio_threshold_for_lines: this is the ratio w/h used to discriminates lines @@ -41,12 +41,19 @@ def estimate_orientation(seq_map: np.ndarray, n_ct: int = 50, ratio_threshold_fo the angle of the general document orientation """ + if np.max(img) <= 1 and np.min(img) >= 0 or (np.max(img) <= 255 and np.min(img) >= 0 and img.shape[-1] == 1): + thresh = img.astype(np.uint8) + if np.max(img) <= 255 and np.min(img) >= 0 and img.shape[-1] == 3: + gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + gray_img = cv2.medianBlur(gray_img, 5) + thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + # try to merge words in lines - (h, w) = seq_map.shape[:2] + (h, w) = img.shape[:2] k_x = max(1, (floor(w / 100))) k_y = max(1, (floor(h / 100))) kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y)) - thresh = cv2.dilate(seq_map, kernel, iterations=1) + thresh = cv2.dilate(thresh, kernel, iterations=1) # extract contours contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) @@ -68,46 +75,6 @@ def estimate_orientation(seq_map: np.ndarray, n_ct: int = 50, ratio_threshold_fo return -median_low(angles) -def get_bitmap_angle(bitmap: np.ndarray, n_ct: int = 20, std_max: float = 3.0) -> float: - """From a binarized segmentation map, find contours and fit min area rectangles to determine page angle - - Args: - ---- - bitmap: binarized segmentation map - n_ct: number of contours to use to fit page angle - std_max: maximum deviation of the angle distribution to consider the mean angle reliable - - Returns: - ------- - The angle of the page - """ - # Find all contours on binarized seg map - contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) - # Sort contours - contours = sorted(contours, key=cv2.contourArea, reverse=True) - - # Find largest contours and fit angles - # Track heights and widths to find aspect ratio (determine is rotation is clockwise) - angles, heights, widths = [], [], [] - for ct in contours[:n_ct]: - _, (w, h), alpha = cv2.minAreaRect(ct) - widths.append(w) - heights.append(h) - angles.append(alpha) - - if np.std(angles) > std_max: - # Edge case with angles of both 0 and 90°, or multi_oriented docs - angle = 0.0 - else: - angle = -np.mean(angles) - # Determine rotation direction (clockwise/counterclockwise) - # Angle coverage: [-90°, +90°], half of the quadrant - if np.sum(widths) < np.sum(heights): # CounterClockwise - angle = 90 + angle - - return angle - - def rectify_crops( crops: List[np.ndarray], orientations: List[int], diff --git a/doctr/models/detection/predictor/pytorch.py b/doctr/models/detection/predictor/pytorch.py index 34f26f03e4..05a6426503 100644 --- a/doctr/models/detection/predictor/pytorch.py +++ b/doctr/models/detection/predictor/pytorch.py @@ -53,9 +53,9 @@ def forward( self.model(batch, return_preds=True, return_model_output=True, **kwargs) for batch in processed_batches ] preds = [pred for batch in predicted_batches for pred in batch["preds"]] - seq_maps = [ + seg_maps = [ pred.permute(1, 2, 0).detach().cpu().numpy() for batch in predicted_batches for pred in batch["out_map"] ] if return_maps: - return preds, seq_maps + return preds, seg_maps return preds diff --git a/doctr/models/detection/predictor/tensorflow.py b/doctr/models/detection/predictor/tensorflow.py index 6317a61874..30a1d3aba1 100644 --- a/doctr/models/detection/predictor/tensorflow.py +++ b/doctr/models/detection/predictor/tensorflow.py @@ -51,7 +51,7 @@ def __call__( ] preds = [pred for batch in predicted_batches for pred in batch["preds"]] - seq_maps = [pred.numpy() for batch in predicted_batches for pred in batch["out_map"]] + seg_maps = [pred.numpy() for batch in predicted_batches for pred in batch["out_map"]] if return_maps: - return preds, seq_maps + return preds, seg_maps return preds diff --git a/doctr/models/kie_predictor/pytorch.py b/doctr/models/kie_predictor/pytorch.py index 957a7270b3..115d8668db 100644 --- a/doctr/models/kie_predictor/pytorch.py +++ b/doctr/models/kie_predictor/pytorch.py @@ -76,14 +76,14 @@ def forward( loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) # Detect document rotation and rotate pages - seq_maps = [ + seg_maps = [ np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype( np.uint8 ) for out_map in out_maps ] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seq_maps] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps] orientations = [ {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] @@ -93,7 +93,7 @@ def forward( origin_page_orientations = ( origin_page_orientations if self.detect_orientation - else [estimate_orientation(seq_map) for seq_map in seq_maps] + else [estimate_orientation(seq_map) for seq_map in seg_maps] ) pages = [ rotate_image(page, -angle, expand=False) # type: ignore[arg-type] diff --git a/doctr/models/kie_predictor/tensorflow.py b/doctr/models/kie_predictor/tensorflow.py index 26c22c66c2..94d6ccab58 100644 --- a/doctr/models/kie_predictor/tensorflow.py +++ b/doctr/models/kie_predictor/tensorflow.py @@ -76,14 +76,14 @@ def __call__( loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) # Detect document rotation and rotate pages - seq_maps = [ + seg_maps = [ np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype( np.uint8 ) for out_map in out_maps ] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seq_maps] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps] orientations = [ {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] @@ -93,7 +93,7 @@ def __call__( origin_page_orientations = ( origin_page_orientations if self.detect_orientation - else [estimate_orientation(seq_map) for seq_map in seq_maps] + else [estimate_orientation(seq_map) for seq_map in seg_maps] ) pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] # Forward again to get predictions on straight pages diff --git a/doctr/models/predictor/pytorch.py b/doctr/models/predictor/pytorch.py index a55e4c3cc4..350444fba3 100644 --- a/doctr/models/predictor/pytorch.py +++ b/doctr/models/predictor/pytorch.py @@ -76,9 +76,9 @@ def forward( loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) # Detect document rotation and rotate pages - seq_maps = [np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0).astype(np.uint8) for out_map in out_maps] + seg_maps = [np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0).astype(np.uint8) for out_map in out_maps] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seq_maps] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps] orientations = [ {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] @@ -88,7 +88,7 @@ def forward( origin_page_orientations = ( origin_page_orientations if self.detect_orientation - else [estimate_orientation(seq_map) for seq_map in seq_maps] + else [estimate_orientation(seq_map) for seq_map in seg_maps] ) pages = [ rotate_image(page, -angle, expand=False) # type: ignore[arg-type] diff --git a/doctr/models/predictor/tensorflow.py b/doctr/models/predictor/tensorflow.py index 14a80ecfd8..5f747a01d4 100644 --- a/doctr/models/predictor/tensorflow.py +++ b/doctr/models/predictor/tensorflow.py @@ -76,9 +76,9 @@ def __call__( loc_preds_dict, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) # Detect document rotation and rotate pages - seq_maps = [np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0).astype(np.uint8) for out_map in out_maps] + seg_maps = [np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0).astype(np.uint8) for out_map in out_maps] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seq_maps] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps] orientations = [ {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] @@ -88,7 +88,7 @@ def __call__( origin_page_orientations = ( origin_page_orientations if self.detect_orientation - else [estimate_orientation(seq_map) for seq_map in seq_maps] + else [estimate_orientation(seq_map) for seq_map in seg_maps] ) pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] # forward again to get predictions on straight pages diff --git a/scripts/analyze.py b/scripts/analyze.py index 067ed62685..2e0f19c034 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -31,8 +31,8 @@ def main(args): out = model(doc) - for page, img in zip(out.pages, doc): - page.show(img, block=not args.noblock, interactive=not args.static) + for page in out.pages: + page.show(block=not args.noblock, interactive=not args.static) def parse_args(): diff --git a/tests/common/test_models.py b/tests/common/test_models.py index ab1dc35323..556b734990 100644 --- a/tests/common/test_models.py +++ b/tests/common/test_models.py @@ -6,7 +6,7 @@ import requests from doctr.io import reader -from doctr.models._utils import estimate_orientation, get_bitmap_angle, get_language, invert_data_structure +from doctr.models._utils import estimate_orientation, get_language, invert_data_structure from doctr.utils import geometry @@ -23,22 +23,32 @@ def mock_image(tmpdir_factory): @pytest.fixture(scope="function") def mock_bitmap(mock_image): - bitmap = np.squeeze(cv2.cvtColor(mock_image, cv2.COLOR_BGR2GRAY)) + bitmap = np.squeeze(cv2.cvtColor(mock_image, cv2.COLOR_BGR2GRAY) / 255.0) + bitmap = np.expand_dims(bitmap, axis=-1) return bitmap -def test_get_bitmap_angle(mock_bitmap): - angle = get_bitmap_angle(mock_bitmap) - assert abs(angle - 30.0) < 1.0 +def test_estimate_orientation(mock_image, mock_bitmap, mock_tilted_payslip): + assert estimate_orientation(mock_image * 0) == 0 + # test binarized image + angle = estimate_orientation(mock_bitmap) + assert abs(angle - 30.0) < 1.0 -def test_estimate_orientation(mock_bitmap): - assert estimate_orientation(mock_bitmap * 0) == 0 + angle = estimate_orientation(mock_bitmap * 255) + assert abs(angle - 30.0) < 1.0 - angle = estimate_orientation(mock_bitmap) + angle = estimate_orientation(mock_image) assert abs(angle - 30.0) < 1.0 - rotated = geometry.rotate_image(mock_bitmap, -angle) + rotated = geometry.rotate_image(mock_image, -angle) + angle_rotated = estimate_orientation(rotated) + assert abs(angle_rotated) < 1.0 + + mock_tilted_payslip = reader.read_img_as_numpy(mock_tilted_payslip) + assert (estimate_orientation(mock_tilted_payslip) - 30.0) < 1.0 + + rotated = geometry.rotate_image(mock_tilted_payslip, -30, expand=True) angle_rotated = estimate_orientation(rotated) assert abs(angle_rotated) < 1.0 diff --git a/tests/pytorch/test_models_detection_pt.py b/tests/pytorch/test_models_detection_pt.py index 39eae65168..8dac82d436 100644 --- a/tests/pytorch/test_models_detection_pt.py +++ b/tests/pytorch/test_models_detection_pt.py @@ -95,9 +95,13 @@ def test_detection_zoo(arch_name): input_tensor = input_tensor.cuda() with torch.no_grad(): - out = predictor(input_tensor) + out, seq_maps = predictor(input_tensor, return_maps=True) assert all(isinstance(boxes, dict) for boxes in out) assert all(isinstance(boxes[CLASS_NAME], np.ndarray) and boxes[CLASS_NAME].shape[1] == 5 for boxes in out) + assert all(isinstance(seq_map, np.ndarray) for seq_map in seq_maps) + assert all(seq_map.shape[:2] == (1024, 1024) for seq_map in seq_maps) + # check that all values in the seq_maps are between 0 and 1 + assert all((seq_map >= 0).all() and (seq_map <= 1).all() for seq_map in seq_maps) def test_erode(): diff --git a/tests/tensorflow/test_models_detection_tf.py b/tests/tensorflow/test_models_detection_tf.py index ef8d6920ef..d5411f3027 100644 --- a/tests/tensorflow/test_models_detection_tf.py +++ b/tests/tensorflow/test_models_detection_tf.py @@ -146,9 +146,13 @@ def test_detection_zoo(arch_name): # object check assert isinstance(predictor, DetectionPredictor) input_tensor = tf.random.uniform(shape=[2, 1024, 1024, 3], minval=0, maxval=1) - out = predictor(input_tensor) + out, seq_maps = predictor(input_tensor, return_maps=True) assert all(isinstance(boxes, dict) for boxes in out) assert all(isinstance(boxes[CLASS_NAME], np.ndarray) and boxes[CLASS_NAME].shape[1] == 5 for boxes in out) + assert all(isinstance(seq_map, np.ndarray) for seq_map in seq_maps) + assert all(seq_map.shape[:2] == (1024, 1024) for seq_map in seq_maps) + # check that all values in the seq_maps are between 0 and 1 + assert all((seq_map >= 0).all() and (seq_map <= 1).all() for seq_map in seq_maps) def test_detection_zoo_error(): From 768e00e703fa65135acb210fb9639621176e1948 Mon Sep 17 00:00:00 2001 From: felix Date: Thu, 5 Oct 2023 11:50:00 +0200 Subject: [PATCH 05/13] more checks and tests --- doctr/models/_utils.py | 4 +++- tests/common/test_models.py | 3 +++ tests/common/test_models_builder.py | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py index 2b189f216a..0f9cfb1c01 100644 --- a/doctr/models/_utils.py +++ b/doctr/models/_utils.py @@ -41,6 +41,7 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li the angle of the general document orientation """ + assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported" if np.max(img) <= 1 and np.min(img) >= 0 or (np.max(img) <= 255 and np.min(img) >= 0 and img.shape[-1] == 1): thresh = img.astype(np.uint8) if np.max(img) <= 255 and np.min(img) >= 0 and img.shape[-1] == 3: @@ -72,7 +73,8 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li if len(angles) == 0: return 0 # in case no angles is found else: - return -median_low(angles) + median = -median_low(angles) + return median if median != 0 else 0 def rectify_crops( diff --git a/tests/common/test_models.py b/tests/common/test_models.py index 556b734990..25fb2c6c5f 100644 --- a/tests/common/test_models.py +++ b/tests/common/test_models.py @@ -52,6 +52,9 @@ def test_estimate_orientation(mock_image, mock_bitmap, mock_tilted_payslip): angle_rotated = estimate_orientation(rotated) assert abs(angle_rotated) < 1.0 + with pytest.raises(AssertionError): + estimate_orientation(np.ones((10, 10, 10))) + def test_get_lang(): sentence = "This is a test sentence." diff --git a/tests/common/test_models_builder.py b/tests/common/test_models_builder.py index 90c681b5f6..0a8edadb39 100644 --- a/tests/common/test_models_builder.py +++ b/tests/common/test_models_builder.py @@ -29,6 +29,9 @@ def test_documentbuilder(): out = doc_builder(pages, [boxes, boxes], [[("hello", 1.0)] * words_per_page] * num_pages, [(100, 200), (100, 200)]) assert isinstance(out, Document) assert len(out.pages) == num_pages + assert all([isinstance(page.page, np.ndarray) for page in out.pages]) and all( + [page.page.shape == (100, 200, 3) for page in out.pages] + ) # 1 Block & 1 line per page assert len(out.pages[0].blocks) == 1 and len(out.pages[0].blocks[0].lines) == 1 assert len(out.pages[0].blocks[0].lines[0].words) == words_per_page @@ -79,6 +82,9 @@ def test_kiedocumentbuilder(): ) assert isinstance(out, KIEDocument) assert len(out.pages) == num_pages + assert all([isinstance(page.page, np.ndarray) for page in out.pages]) and all( + [page.page.shape == (100, 200, 3) for page in out.pages] + ) # 1 Block & 1 line per page assert len(out.pages[0].predictions) == 1 assert len(out.pages[0].predictions[CLASS_NAME]) == words_per_page From 2e2f397963c356d0192c134866ac79b8002bc52c Mon Sep 17 00:00:00 2001 From: felix Date: Thu, 5 Oct 2023 11:54:28 +0200 Subject: [PATCH 06/13] correct median --- doctr/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py index 0f9cfb1c01..ab2ec63715 100644 --- a/doctr/models/_utils.py +++ b/doctr/models/_utils.py @@ -74,7 +74,7 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li return 0 # in case no angles is found else: median = -median_low(angles) - return median if median != 0 else 0 + return median if abs(median) != 0 else 0 def rectify_crops( From 13b90cd2cbbd8fd17333c082488a482d15f8d73e Mon Sep 17 00:00:00 2001 From: felix Date: Thu, 12 Oct 2023 11:01:05 +0200 Subject: [PATCH 07/13] rebase From 0cbadb5232595ce85cf6396bea3a1ae72ee5d013 Mon Sep 17 00:00:00 2001 From: felix Date: Fri, 13 Oct 2023 15:34:23 +0200 Subject: [PATCH 08/13] round angle to int --- doctr/models/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py index ab2ec63715..ec93c56f7e 100644 --- a/doctr/models/_utils.py +++ b/doctr/models/_utils.py @@ -27,7 +27,7 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float: return max(w / h, h / w) -def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> float: +def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> int: """Estimate the angle of the general document orientation based on the lines of the document and the assumption that they should be horizontal. @@ -74,7 +74,7 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li return 0 # in case no angles is found else: median = -median_low(angles) - return median if abs(median) != 0 else 0 + return round(median) if abs(median) != 0 else 0 def rectify_crops( From 54e960467c086437a2882d7d99c50199a7406daf Mon Sep 17 00:00:00 2001 From: felix Date: Mon, 16 Oct 2023 08:19:12 +0200 Subject: [PATCH 09/13] update doc string --- doctr/models/zoo.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/doctr/models/zoo.py b/doctr/models/zoo.py index da806227c0..1dc131acd7 100644 --- a/doctr/models/zoo.py +++ b/doctr/models/zoo.py @@ -99,9 +99,10 @@ def ocr_predictor( (potentially rotated) as straight bounding boxes. detect_orientation: if True, the estimated general page orientation will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. - straighten_pages: if True, estimates the page general orientation based on the median line orientation. - Then, rotates page before passing it to the deep learning modules. The final predictions will be remapped - accordingly. Doing so will improve performances for documents with page-uniform rotations. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. kwargs: keyword args of `OCRPredictor` @@ -212,9 +213,10 @@ def kie_predictor( (potentially rotated) as straight bounding boxes. detect_orientation: if True, the estimated general page orientation will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. - straighten_pages: if True, estimates the page general orientation based on the median line orientation. - Then, rotates page before passing it to the deep learning modules. The final predictions will be remapped - accordingly. Doing so will improve performances for documents with page-uniform rotations. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. kwargs: keyword args of `OCRPredictor` From 3fc93cff2a5eabfc9539c4fbe50dec2546478a61 Mon Sep 17 00:00:00 2001 From: felix Date: Fri, 10 Nov 2023 08:42:33 +0100 Subject: [PATCH 10/13] rebase From d5c23b3a468e6a8377f66dde0ca4f2f2fab3f9a0 Mon Sep 17 00:00:00 2001 From: felix Date: Wed, 15 Nov 2023 10:19:32 +0100 Subject: [PATCH 11/13] rebase --- doctr/models/_utils.py | 10 ++++++++-- doctr/models/detection/predictor/pytorch.py | 6 +++--- doctr/models/detection/predictor/tensorflow.py | 2 +- doctr/models/kie_predictor/pytorch.py | 11 ++++------- doctr/models/kie_predictor/tensorflow.py | 2 +- doctr/models/predictor/base.py | 2 +- doctr/models/predictor/pytorch.py | 11 ++++------- doctr/models/predictor/tensorflow.py | 6 ++---- tests/common/test_models_builder.py | 8 ++++---- 9 files changed, 28 insertions(+), 30 deletions(-) diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py index ec93c56f7e..71828d3151 100644 --- a/doctr/models/_utils.py +++ b/doctr/models/_utils.py @@ -21,7 +21,9 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float: ---- contour: the contour from cv2.findContour - Returns: the maximum shape ratio + Returns: + ------- + the maximum shape ratio """ _, (w, h), _ = cv2.minAreaRect(contour) return max(w / h, h / w) @@ -32,6 +34,7 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li lines of the document and the assumption that they should be horizontal. Args: + ---- img: the img or bitmap to analyze (H, W, C) n_ct: the number of contours used for the orientation estimation ratio_threshold_for_lines: this is the ratio w/h used to discriminates lines @@ -40,7 +43,6 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li ------- the angle of the general document orientation """ - assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported" if np.max(img) <= 1 and np.min(img) >= 0 or (np.max(img) <= 255 and np.min(img) >= 0 and img.shape[-1] == 1): thresh = img.astype(np.uint8) @@ -119,9 +121,13 @@ def rectify_loc_preds( def get_language(text: str) -> Tuple[str, float]: """Get languages of a text using langdetect model. Get the language with the highest probability or no language if only a few words or a low probability + Args: + ---- text (str): text + Returns: + ------- The detected language in ISO 639 code and confidence score """ try: diff --git a/doctr/models/detection/predictor/pytorch.py b/doctr/models/detection/predictor/pytorch.py index 05a6426503..b78dc4b759 100644 --- a/doctr/models/detection/predictor/pytorch.py +++ b/doctr/models/detection/predictor/pytorch.py @@ -53,9 +53,9 @@ def forward( self.model(batch, return_preds=True, return_model_output=True, **kwargs) for batch in processed_batches ] preds = [pred for batch in predicted_batches for pred in batch["preds"]] - seg_maps = [ - pred.permute(1, 2, 0).detach().cpu().numpy() for batch in predicted_batches for pred in batch["out_map"] - ] if return_maps: + seg_maps = [ + pred.permute(1, 2, 0).detach().cpu().numpy() for batch in predicted_batches for pred in batch["out_map"] + ] return preds, seg_maps return preds diff --git a/doctr/models/detection/predictor/tensorflow.py b/doctr/models/detection/predictor/tensorflow.py index 30a1d3aba1..d82b9f25f5 100644 --- a/doctr/models/detection/predictor/tensorflow.py +++ b/doctr/models/detection/predictor/tensorflow.py @@ -51,7 +51,7 @@ def __call__( ] preds = [pred for batch in predicted_batches for pred in batch["preds"]] - seg_maps = [pred.numpy() for batch in predicted_batches for pred in batch["out_map"]] if return_maps: + seg_maps = [pred.numpy() for batch in predicted_batches for pred in batch["out_map"]] return preds, seg_maps return preds diff --git a/doctr/models/kie_predictor/pytorch.py b/doctr/models/kie_predictor/pytorch.py index 115d8668db..e5dee4fffd 100644 --- a/doctr/models/kie_predictor/pytorch.py +++ b/doctr/models/kie_predictor/pytorch.py @@ -36,7 +36,7 @@ class KIEPredictor(nn.Module, _KIEPredictor): page. Doing so will slightly deteriorate the overall latency. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. - kwargs: keyword args of `DocumentBuilder` + **kwargs: keyword args of `DocumentBuilder` """ def __init__( @@ -95,10 +95,7 @@ def forward( if self.detect_orientation else [estimate_orientation(seq_map) for seq_map in seg_maps] ) - pages = [ - rotate_image(page, -angle, expand=False) # type: ignore[arg-type] - for page, angle in zip(pages, origin_page_orientations) - ] + pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] # Forward again to get predictions on straight pages loc_preds = self.det_predictor(pages, **kwargs) @@ -146,10 +143,10 @@ def forward( languages_dict = None out = self.doc_builder( - pages, # type: ignore[arg-type] + pages, boxes_per_page, text_preds_per_page, - origin_page_shapes, # type: ignore[arg-type] + origin_page_shapes, orientations, languages_dict, ) diff --git a/doctr/models/kie_predictor/tensorflow.py b/doctr/models/kie_predictor/tensorflow.py index 94d6ccab58..6ac0a6221f 100644 --- a/doctr/models/kie_predictor/tensorflow.py +++ b/doctr/models/kie_predictor/tensorflow.py @@ -36,7 +36,7 @@ class KIEPredictor(NestedObject, _KIEPredictor): page. Doing so will slightly deteriorate the overall latency. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. - kwargs: keyword args of `DocumentBuilder` + **kwargs: keyword args of `DocumentBuilder` """ _children_names = ["det_predictor", "reco_predictor", "doc_builder"] diff --git a/doctr/models/predictor/base.py b/doctr/models/predictor/base.py index 1190606299..4de41e01e0 100644 --- a/doctr/models/predictor/base.py +++ b/doctr/models/predictor/base.py @@ -29,7 +29,7 @@ class _OCRPredictor: accordingly. Doing so will improve performances for documents with page-uniform rotations. preserve_aspect_ratio: if True, resize preserving the aspect ratio (with padding) symmetric_pad: if True and preserve_aspect_ratio is True, pas the image symmetrically. - kwargs: keyword args of `DocumentBuilder` + **kwargs: keyword args of `DocumentBuilder` """ crop_orientation_predictor: Optional[CropOrientationPredictor] diff --git a/doctr/models/predictor/pytorch.py b/doctr/models/predictor/pytorch.py index 350444fba3..874128c99f 100644 --- a/doctr/models/predictor/pytorch.py +++ b/doctr/models/predictor/pytorch.py @@ -36,7 +36,7 @@ class OCRPredictor(nn.Module, _OCRPredictor): page. Doing so will slightly deteriorate the overall latency. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. - kwargs: keyword args of `DocumentBuilder` + **kwargs: keyword args of `DocumentBuilder` """ def __init__( @@ -90,10 +90,7 @@ def forward( if self.detect_orientation else [estimate_orientation(seq_map) for seq_map in seg_maps] ) - pages = [ - rotate_image(page, -angle, expand=False) # type: ignore[arg-type] - for page, angle in zip(pages, origin_page_orientations) - ] + pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] # Forward again to get predictions on straight pages loc_preds = self.det_predictor(pages, **kwargs) @@ -130,10 +127,10 @@ def forward( languages_dict = None out = self.doc_builder( - pages, # type: ignore[arg-type] + pages, boxes, text_preds, - origin_page_shapes, # type: ignore[arg-type] + origin_page_shapes, orientations, languages_dict, ) diff --git a/doctr/models/predictor/tensorflow.py b/doctr/models/predictor/tensorflow.py index 5f747a01d4..5128711502 100644 --- a/doctr/models/predictor/tensorflow.py +++ b/doctr/models/predictor/tensorflow.py @@ -36,7 +36,7 @@ class OCRPredictor(NestedObject, _OCRPredictor): page. Doing so will slightly deteriorate the overall latency. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. - kwargs: keyword args of `DocumentBuilder` + **kwargs: keyword args of `DocumentBuilder` """ _children_names = ["det_predictor", "reco_predictor", "doc_builder"] @@ -97,9 +97,7 @@ def __call__( assert all( len(loc_pred) == 1 for loc_pred in loc_preds_dict ), "Detection Model in ocr_predictor should output only one class" - loc_preds: List[np.ndarray] = [ - list(loc_pred.values())[0] for loc_pred in loc_preds_dict # type: ignore[union-attr] - ] + loc_preds: List[np.ndarray] = [list(loc_pred.values())[0] for loc_pred in loc_preds_dict] # type: ignore[union-attr] # Rectify crops if aspect ratio loc_preds = self._remove_padding(pages, loc_preds) diff --git a/tests/common/test_models_builder.py b/tests/common/test_models_builder.py index 0a8edadb39..7d233dafb0 100644 --- a/tests/common/test_models_builder.py +++ b/tests/common/test_models_builder.py @@ -29,8 +29,8 @@ def test_documentbuilder(): out = doc_builder(pages, [boxes, boxes], [[("hello", 1.0)] * words_per_page] * num_pages, [(100, 200), (100, 200)]) assert isinstance(out, Document) assert len(out.pages) == num_pages - assert all([isinstance(page.page, np.ndarray) for page in out.pages]) and all( - [page.page.shape == (100, 200, 3) for page in out.pages] + assert all(isinstance(page.page, np.ndarray) for page in out.pages) and all( + page.page.shape == (100, 200, 3) for page in out.pages ) # 1 Block & 1 line per page assert len(out.pages[0].blocks) == 1 and len(out.pages[0].blocks[0].lines) == 1 @@ -82,8 +82,8 @@ def test_kiedocumentbuilder(): ) assert isinstance(out, KIEDocument) assert len(out.pages) == num_pages - assert all([isinstance(page.page, np.ndarray) for page in out.pages]) and all( - [page.page.shape == (100, 200, 3) for page in out.pages] + assert all(isinstance(page.page, np.ndarray) for page in out.pages) and all( + page.page.shape == (100, 200, 3) for page in out.pages ) # 1 Block & 1 line per page assert len(out.pages[0].predictions) == 1 From 375486db4309d84a0f9f7b367fb5a43a9f862c12 Mon Sep 17 00:00:00 2001 From: felix Date: Thu, 16 Nov 2023 15:13:03 +0100 Subject: [PATCH 12/13] fix docstrings --- doctr/io/elements.py | 1 + doctr/models/builder.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doctr/io/elements.py b/doctr/io/elements.py index d94ade587e..c0d522a0a5 100644 --- a/doctr/io/elements.py +++ b/doctr/io/elements.py @@ -234,6 +234,7 @@ class Page(Element): """Implements a page element as a collection of blocks Args: + ---- page: image encoded as a numpy array in uint8 blocks: list of block elements page_idx: the index of the page in the input raw document diff --git a/doctr/models/builder.py b/doctr/models/builder.py index b974f7c0db..764b48ec37 100644 --- a/doctr/models/builder.py +++ b/doctr/models/builder.py @@ -297,6 +297,7 @@ def __call__( """Re-arrange detected words into structured blocks Args: + ---- pages: list of N elements, where each element represents the page image boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5) or (*, 6) for all words for a given page @@ -368,6 +369,7 @@ def __call__( # type: ignore[override] """Re-arrange detected words into structured predictions Args: + ---- pages: list of N elements, where each element represents the page image boxes: list of N dictionaries, where each element represents the localization predictions for a class, of shape (*, 5) or (*, 6) for all predictions From 727099d7e20f622aa1f7a1dd62452ebe6d001827 Mon Sep 17 00:00:00 2001 From: felix Date: Thu, 16 Nov 2023 18:12:30 +0100 Subject: [PATCH 13/13] apply suggestion --- doctr/models/_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py index 71828d3151..484538b1a0 100644 --- a/doctr/models/_utils.py +++ b/doctr/models/_utils.py @@ -44,9 +44,11 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li the angle of the general document orientation """ assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported" - if np.max(img) <= 1 and np.min(img) >= 0 or (np.max(img) <= 255 and np.min(img) >= 0 and img.shape[-1] == 1): + max_value = np.max(img) + min_value = np.min(img) + if max_value <= 1 and min_value >= 0 or (max_value <= 255 and min_value >= 0 and img.shape[-1] == 1): thresh = img.astype(np.uint8) - if np.max(img) <= 255 and np.min(img) >= 0 and img.shape[-1] == 3: + if max_value <= 255 and min_value >= 0 and img.shape[-1] == 3: gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) gray_img = cv2.medianBlur(gray_img, 5) thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]