diff --git a/README.md b/README.md index 242cff4d59..93cc877b70 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ If both options are set to False, the predictor will always fit and return rotat To interpret your model's predictions, you can visualize them interactively as follows: ```python -result.show(doc) +result.show() ``` ![Visualization sample](docs/images/doctr_example_script.gif) diff --git a/doctr/io/elements.py b/doctr/io/elements.py index 4e92a4043f..c0d522a0a5 100644 --- a/doctr/io/elements.py +++ b/doctr/io/elements.py @@ -235,6 +235,7 @@ class Page(Element): Args: ---- + page: image encoded as a numpy array in uint8 blocks: list of block elements page_idx: the index of the page in the input raw document dimensions: the page size in pixels in format (height, width) @@ -248,6 +249,7 @@ class Page(Element): def __init__( self, + page: np.ndarray, blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], @@ -255,6 +257,7 @@ def __init__( language: Optional[Dict[str, Any]] = None, ) -> None: super().__init__(blocks=blocks) + self.page = page self.page_idx = page_idx self.dimensions = dimensions self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) @@ -267,17 +270,15 @@ def render(self, block_break: str = "\n\n") -> str: def extra_repr(self) -> str: return f"dimensions={self.dimensions}" - def show(self, page: np.ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None: + def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None: """Overlay the result on a given image Args: - ---- - page: image encoded as a numpy array in uint8 interactive: whether the display should be interactive preserve_aspect_ratio: pass True if you passed True to the predictor **kwargs: additional keyword arguments passed to the matplotlib.pyplot.show method """ - visualize_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio) + visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio) plt.show(**kwargs) def synthesize(self, **kwargs) -> np.ndarray: @@ -408,6 +409,7 @@ class KIEPage(Element): Args: ---- predictions: Dictionary with list of block elements for each detection class + page: image encoded as a numpy array in uint8 page_idx: the index of the page in the input raw document dimensions: the page size in pixels in format (height, width) orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction @@ -420,6 +422,7 @@ class KIEPage(Element): def __init__( self, + page: np.ndarray, predictions: Dict[str, List[Prediction]], page_idx: int, dimensions: Tuple[int, int], @@ -427,6 +430,7 @@ def __init__( language: Optional[Dict[str, Any]] = None, ) -> None: super().__init__(predictions=predictions) + self.page = page self.page_idx = page_idx self.dimensions = dimensions self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) @@ -441,17 +445,17 @@ def render(self, prediction_break: str = "\n\n") -> str: def extra_repr(self) -> str: return f"dimensions={self.dimensions}" - def show(self, page: np.ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None: + def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None: """Overlay the result on a given image Args: - ---- - page: image encoded as a numpy array in uint8 interactive: whether the display should be interactive preserve_aspect_ratio: pass True if you passed True to the predictor **kwargs: keyword arguments passed to the matplotlib.pyplot.show method """ - visualize_kie_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio) + visualize_kie_page( + self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio + ) plt.show(**kwargs) def synthesize(self, **kwargs) -> np.ndarray: @@ -561,16 +565,10 @@ def render(self, page_break: str = "\n\n\n\n") -> str: """Renders the full text of the element""" return page_break.join(p.render() for p in self.pages) - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - ---- - pages: list of images encoded as numpy arrays in uint8 - **kwargs: keyword arguments passed to the Page.show method - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs) + def show(self, **kwargs) -> None: + """Overlay the result on a given image""" + for result in self.pages: + result.show(**kwargs) def synthesize(self, **kwargs) -> List[np.ndarray]: """Synthesize all pages from their predictions diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py index 1021afdc0c..484538b1a0 100644 --- a/doctr/models/_utils.py +++ b/doctr/models/_utils.py @@ -11,7 +11,7 @@ import numpy as np from langdetect import LangDetectException, detect_langs -__all__ = ["estimate_orientation", "get_bitmap_angle", "get_language", "invert_data_structure"] +__all__ = ["estimate_orientation", "get_language", "invert_data_structure"] def get_max_width_length_ratio(contour: np.ndarray) -> float: @@ -21,19 +21,21 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float: ---- contour: the contour from cv2.findContour - Returns: the maximum shape ratio + Returns: + ------- + the maximum shape ratio """ _, (w, h), _ = cv2.minAreaRect(contour) return max(w / h, h / w) -def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> float: +def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> int: """Estimate the angle of the general document orientation based on the lines of the document and the assumption that they should be horizontal. Args: ---- - img: the img to analyze + img: the img or bitmap to analyze (H, W, C) n_ct: the number of contours used for the orientation estimation ratio_threshold_for_lines: this is the ratio w/h used to discriminates lines @@ -41,9 +43,15 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li ------- the angle of the general document orientation """ - gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - gray_img = cv2.medianBlur(gray_img, 5) - thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported" + max_value = np.max(img) + min_value = np.min(img) + if max_value <= 1 and min_value >= 0 or (max_value <= 255 and min_value >= 0 and img.shape[-1] == 1): + thresh = img.astype(np.uint8) + if max_value <= 255 and min_value >= 0 and img.shape[-1] == 3: + gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + gray_img = cv2.medianBlur(gray_img, 5) + thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] # try to merge words in lines (h, w) = img.shape[:2] @@ -69,47 +77,8 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li if len(angles) == 0: return 0 # in case no angles is found else: - return -median_low(angles) - - -def get_bitmap_angle(bitmap: np.ndarray, n_ct: int = 20, std_max: float = 3.0) -> float: - """From a binarized segmentation map, find contours and fit min area rectangles to determine page angle - - Args: - ---- - bitmap: binarized segmentation map - n_ct: number of contours to use to fit page angle - std_max: maximum deviation of the angle distribution to consider the mean angle reliable - - Returns: - ------- - The angle of the page - """ - # Find all contours on binarized seg map - contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) - # Sort contours - contours = sorted(contours, key=cv2.contourArea, reverse=True) - - # Find largest contours and fit angles - # Track heights and widths to find aspect ratio (determine is rotation is clockwise) - angles, heights, widths = [], [], [] - for ct in contours[:n_ct]: - _, (w, h), alpha = cv2.minAreaRect(ct) - widths.append(w) - heights.append(h) - angles.append(alpha) - - if np.std(angles) > std_max: - # Edge case with angles of both 0 and 90°, or multi_oriented docs - angle = 0.0 - else: - angle = -np.mean(angles) - # Determine rotation direction (clockwise/counterclockwise) - # Angle coverage: [-90°, +90°], half of the quadrant - if np.sum(widths) < np.sum(heights): # CounterClockwise - angle = 90 + angle - - return angle + median = -median_low(angles) + return round(median) if abs(median) != 0 else 0 def rectify_crops( @@ -154,9 +123,13 @@ def rectify_loc_preds( def get_language(text: str) -> Tuple[str, float]: """Get languages of a text using langdetect model. Get the language with the highest probability or no language if only a few words or a low probability + Args: + ---- text (str): text + Returns: + ------- The detected language in ISO 639 code and confidence score """ try: diff --git a/doctr/models/builder.py b/doctr/models/builder.py index 820689bbac..764b48ec37 100644 --- a/doctr/models/builder.py +++ b/doctr/models/builder.py @@ -287,6 +287,7 @@ def extra_repr(self) -> str: def __call__( self, + pages: List[np.ndarray], boxes: List[np.ndarray], text_preds: List[List[Tuple[str, float]]], page_shapes: List[Tuple[int, int]], @@ -297,6 +298,7 @@ def __call__( Args: ---- + pages: list of N elements, where each element represents the page image boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5) or (*, 6) for all words for a given page text_preds: list of N elements, where each element is the list of all word prediction (text + confidence) @@ -325,6 +327,7 @@ def __call__( _pages = [ Page( + page, self._build_blocks( page_boxes, word_preds, @@ -334,8 +337,8 @@ def __call__( orientation, language, ) - for _idx, shape, page_boxes, word_preds, orientation, language in zip( - range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages + for page, _idx, shape, page_boxes, word_preds, orientation, language in zip( + pages, range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages ) ] @@ -356,6 +359,7 @@ class KIEDocumentBuilder(DocumentBuilder): def __call__( # type: ignore[override] self, + pages: List[np.ndarray], boxes: List[Dict[str, np.ndarray]], text_preds: List[Dict[str, List[Tuple[str, float]]]], page_shapes: List[Tuple[int, int]], @@ -366,6 +370,7 @@ def __call__( # type: ignore[override] Args: ---- + pages: list of N elements, where each element represents the page image boxes: list of N dictionaries, where each element represents the localization predictions for a class, of shape (*, 5) or (*, 6) for all predictions text_preds: list of N dictionaries, where each element is the list of all word prediction @@ -400,6 +405,7 @@ def __call__( # type: ignore[override] _pages = [ KIEPage( + page, { k: self._build_blocks( page_boxes[k], @@ -412,8 +418,8 @@ def __call__( # type: ignore[override] orientation, language, ) - for _idx, shape, page_boxes, word_preds, orientation, language in zip( - range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages + for page, _idx, shape, page_boxes, word_preds, orientation, language in zip( + pages, range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages ) ] diff --git a/doctr/models/detection/predictor/pytorch.py b/doctr/models/detection/predictor/pytorch.py index 8202abca8d..b78dc4b759 100644 --- a/doctr/models/detection/predictor/pytorch.py +++ b/doctr/models/detection/predictor/pytorch.py @@ -3,7 +3,7 @@ # This program is licensed under the Apache License 2.0. # See LICENSE or go to for full license details. -from typing import Any, List, Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import torch @@ -37,8 +37,9 @@ def __init__( def forward( self, pages: List[Union[np.ndarray, torch.Tensor]], + return_maps: bool = False, **kwargs: Any, - ) -> List[np.ndarray]: + ) -> Union[List[Dict[str, np.ndarray]], Tuple[List[Dict[str, np.ndarray]], List[np.ndarray]]]: # Dimension check if any(page.ndim != 3 for page in pages): raise ValueError("incorrect input shape: all pages are expected to be multi-channel 2D images.") @@ -48,5 +49,13 @@ def forward( self.model, processed_batches = set_device_and_dtype( self.model, processed_batches, _params.device, _params.dtype ) - predicted_batches = [self.model(batch, return_preds=True, **kwargs)["preds"] for batch in processed_batches] - return [pred for batch in predicted_batches for pred in batch] + predicted_batches = [ + self.model(batch, return_preds=True, return_model_output=True, **kwargs) for batch in processed_batches + ] + preds = [pred for batch in predicted_batches for pred in batch["preds"]] + if return_maps: + seg_maps = [ + pred.permute(1, 2, 0).detach().cpu().numpy() for batch in predicted_batches for pred in batch["out_map"] + ] + return preds, seg_maps + return preds diff --git a/doctr/models/detection/predictor/tensorflow.py b/doctr/models/detection/predictor/tensorflow.py index 80251ff96e..d82b9f25f5 100644 --- a/doctr/models/detection/predictor/tensorflow.py +++ b/doctr/models/detection/predictor/tensorflow.py @@ -3,7 +3,7 @@ # This program is licensed under the Apache License 2.0. # See LICENSE or go to for full license details. -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import tensorflow as tf @@ -37,14 +37,21 @@ def __init__( def __call__( self, pages: List[Union[np.ndarray, tf.Tensor]], + return_maps: bool = False, **kwargs: Any, - ) -> List[Dict[str, np.ndarray]]: + ) -> Union[List[Dict[str, np.ndarray]], Tuple[List[Dict[str, np.ndarray]], List[np.ndarray]]]: # Dimension check if any(page.ndim != 3 for page in pages): raise ValueError("incorrect input shape: all pages are expected to be multi-channel 2D images.") processed_batches = self.pre_processor(pages) predicted_batches = [ - self.model(batch, return_preds=True, training=False, **kwargs)["preds"] for batch in processed_batches + self.model(batch, return_preds=True, return_model_output=True, training=False, **kwargs) + for batch in processed_batches ] - return [pred for batch in predicted_batches for pred in batch] + + preds = [pred for batch in predicted_batches for pred in batch["preds"]] + if return_maps: + seg_maps = [pred.numpy() for batch in predicted_batches for pred in batch["out_map"]] + return preds, seg_maps + return preds diff --git a/doctr/models/kie_predictor/pytorch.py b/doctr/models/kie_predictor/pytorch.py index 520dcdaf0e..e5dee4fffd 100644 --- a/doctr/models/kie_predictor/pytorch.py +++ b/doctr/models/kie_predictor/pytorch.py @@ -13,7 +13,7 @@ from doctr.models._utils import estimate_orientation, get_language, invert_data_structure from doctr.models.detection.predictor import DetectionPredictor from doctr.models.recognition.predictor import RecognitionPredictor -from doctr.utils.geometry import rotate_boxes, rotate_image +from doctr.utils.geometry import rotate_image from .base import _KIEPredictor @@ -36,7 +36,7 @@ class KIEPredictor(nn.Module, _KIEPredictor): page. Doing so will slightly deteriorate the overall latency. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. - kwargs: keyword args of `DocumentBuilder` + **kwargs: keyword args of `DocumentBuilder` """ def __init__( @@ -72,22 +72,33 @@ def forward( origin_page_shapes = [page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:] for page in pages] + # Localize text elements + loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) + # Detect document rotation and rotate pages + seg_maps = [ + np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype( + np.uint8 + ) + for out_map in out_maps + ] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(page) for page in pages] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps] orientations = [ - {"value": orientation_page, "confidence": 1.0} for orientation_page in origin_page_orientations + {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] else: orientations = None if self.straighten_pages: origin_page_orientations = ( - origin_page_orientations if self.detect_orientation else [estimate_orientation(page) for page in pages] + origin_page_orientations + if self.detect_orientation + else [estimate_orientation(seq_map) for seq_map in seg_maps] ) - pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)] + pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] + # Forward again to get predictions on straight pages + loc_preds = self.det_predictor(pages, **kwargs) - # Localize text elements - loc_preds = self.det_predictor(pages, **kwargs) dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds) # type: ignore[assignment] # Check whether crop mode should be switched to channels first channels_last = len(pages) == 0 or isinstance(pages[0], np.ndarray) @@ -130,27 +141,12 @@ def forward( languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages] else: languages_dict = None - # Rotate back pages and boxes while keeping original image size - if self.straighten_pages: - boxes_per_page = [ - { - k: rotate_boxes( - page_boxes, - angle, - orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[1:], - target_shape=mask, - ) - for k, page_boxes in page_boxes_dict.items() - } - for page_boxes_dict, page, angle, mask in zip( - boxes_per_page, pages, origin_page_orientations, origin_page_shapes - ) - ] out = self.doc_builder( + pages, boxes_per_page, text_preds_per_page, - [page.shape[:2] if channels_last else page.shape[-2:] for page in pages], # type: ignore[misc] + origin_page_shapes, orientations, languages_dict, ) diff --git a/doctr/models/kie_predictor/tensorflow.py b/doctr/models/kie_predictor/tensorflow.py index d6dca51520..6ac0a6221f 100644 --- a/doctr/models/kie_predictor/tensorflow.py +++ b/doctr/models/kie_predictor/tensorflow.py @@ -12,7 +12,7 @@ from doctr.models._utils import estimate_orientation, get_language, invert_data_structure from doctr.models.detection.predictor import DetectionPredictor from doctr.models.recognition.predictor import RecognitionPredictor -from doctr.utils.geometry import rotate_boxes, rotate_image +from doctr.utils.geometry import rotate_image from doctr.utils.repr import NestedObject from .base import _KIEPredictor @@ -36,7 +36,7 @@ class KIEPredictor(NestedObject, _KIEPredictor): page. Doing so will slightly deteriorate the overall latency. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. - kwargs: keyword args of `DocumentBuilder` + **kwargs: keyword args of `DocumentBuilder` """ _children_names = ["det_predictor", "reco_predictor", "doc_builder"] @@ -72,24 +72,34 @@ def __call__( origin_page_shapes = [page.shape[:2] for page in pages] + # Localize text elements + loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) + # Detect document rotation and rotate pages + seg_maps = [ + np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype( + np.uint8 + ) + for out_map in out_maps + ] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(page) for page in pages] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps] orientations = [ - {"value": orientation_page, "confidence": 1.0} for orientation_page in origin_page_orientations + {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] else: orientations = None if self.straighten_pages: origin_page_orientations = ( - origin_page_orientations if self.detect_orientation else [estimate_orientation(page) for page in pages] + origin_page_orientations + if self.detect_orientation + else [estimate_orientation(seq_map) for seq_map in seg_maps] ) - pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)] - - # Localize text elements - loc_preds = self.det_predictor(pages, **kwargs) + pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] + # Forward again to get predictions on straight pages + loc_preds = self.det_predictor(pages, **kwargs) # type: ignore[assignment] - dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds) # type: ignore[assignment] + dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds) # type: ignore # Rectify crops if aspect ratio dict_loc_preds = {k: self._remove_padding(pages, loc_pred) for k, loc_pred in dict_loc_preds.items()} @@ -127,24 +137,9 @@ def __call__( languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages] else: languages_dict = None - # Rotate back pages and boxes while keeping original image size - if self.straighten_pages: - boxes_per_page = [ - { - k: rotate_boxes( - page_boxes, - angle, - orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:], - target_shape=mask, # type: ignore[arg-type] - ) - for k, page_boxes in page_boxes_dict.items() - } - for page_boxes_dict, page, angle, mask in zip( - boxes_per_page, pages, origin_page_orientations, origin_page_shapes - ) - ] out = self.doc_builder( + pages, boxes_per_page, text_preds_per_page, origin_page_shapes, # type: ignore[arg-type] diff --git a/doctr/models/predictor/base.py b/doctr/models/predictor/base.py index 1190606299..4de41e01e0 100644 --- a/doctr/models/predictor/base.py +++ b/doctr/models/predictor/base.py @@ -29,7 +29,7 @@ class _OCRPredictor: accordingly. Doing so will improve performances for documents with page-uniform rotations. preserve_aspect_ratio: if True, resize preserving the aspect ratio (with padding) symmetric_pad: if True and preserve_aspect_ratio is True, pas the image symmetrically. - kwargs: keyword args of `DocumentBuilder` + **kwargs: keyword args of `DocumentBuilder` """ crop_orientation_predictor: Optional[CropOrientationPredictor] diff --git a/doctr/models/predictor/pytorch.py b/doctr/models/predictor/pytorch.py index 59b34c8dca..874128c99f 100644 --- a/doctr/models/predictor/pytorch.py +++ b/doctr/models/predictor/pytorch.py @@ -13,7 +13,7 @@ from doctr.models._utils import estimate_orientation, get_language from doctr.models.detection.predictor import DetectionPredictor from doctr.models.recognition.predictor import RecognitionPredictor -from doctr.utils.geometry import rotate_boxes, rotate_image +from doctr.utils.geometry import rotate_image from .base import _OCRPredictor @@ -36,7 +36,7 @@ class OCRPredictor(nn.Module, _OCRPredictor): page. Doing so will slightly deteriorate the overall latency. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. - kwargs: keyword args of `DocumentBuilder` + **kwargs: keyword args of `DocumentBuilder` """ def __init__( @@ -72,22 +72,28 @@ def forward( origin_page_shapes = [page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:] for page in pages] + # Localize text elements + loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) + # Detect document rotation and rotate pages + seg_maps = [np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0).astype(np.uint8) for out_map in out_maps] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(page) for page in pages] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps] orientations = [ - {"value": orientation_page, "confidence": 1.0} for orientation_page in origin_page_orientations + {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] else: orientations = None if self.straighten_pages: origin_page_orientations = ( - origin_page_orientations if self.detect_orientation else [estimate_orientation(page) for page in pages] + origin_page_orientations + if self.detect_orientation + else [estimate_orientation(seq_map) for seq_map in seg_maps] ) - pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)] + pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] + # Forward again to get predictions on straight pages + loc_preds = self.det_predictor(pages, **kwargs) - # Localize text elements - loc_preds = self.det_predictor(pages, **kwargs) assert all( len(loc_pred) == 1 for loc_pred in loc_preds ), "Detection Model in ocr_predictor should output only one class" @@ -119,22 +125,12 @@ def forward( languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages] else: languages_dict = None - # Rotate back pages and boxes while keeping original image size - if self.straighten_pages: - boxes = [ - rotate_boxes( - page_boxes, - angle, - orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[1:], - target_shape=mask, - ) - for page_boxes, page, angle, mask in zip(boxes, pages, origin_page_orientations, origin_page_shapes) - ] out = self.doc_builder( + pages, boxes, text_preds, - [page.shape[:2] if channels_last else page.shape[-2:] for page in pages], # type: ignore[misc] + origin_page_shapes, orientations, languages_dict, ) diff --git a/doctr/models/predictor/tensorflow.py b/doctr/models/predictor/tensorflow.py index 9ac31b3957..5128711502 100644 --- a/doctr/models/predictor/tensorflow.py +++ b/doctr/models/predictor/tensorflow.py @@ -12,7 +12,7 @@ from doctr.models._utils import estimate_orientation, get_language from doctr.models.detection.predictor import DetectionPredictor from doctr.models.recognition.predictor import RecognitionPredictor -from doctr.utils.geometry import rotate_boxes, rotate_image +from doctr.utils.geometry import rotate_image from doctr.utils.repr import NestedObject from .base import _OCRPredictor @@ -36,7 +36,7 @@ class OCRPredictor(NestedObject, _OCRPredictor): page. Doing so will slightly deteriorate the overall latency. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. - kwargs: keyword args of `DocumentBuilder` + **kwargs: keyword args of `DocumentBuilder` """ _children_names = ["det_predictor", "reco_predictor", "doc_builder"] @@ -72,27 +72,32 @@ def __call__( origin_page_shapes = [page.shape[:2] for page in pages] + # Localize text elements + loc_preds_dict, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) + # Detect document rotation and rotate pages + seg_maps = [np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0).astype(np.uint8) for out_map in out_maps] if self.detect_orientation: - origin_page_orientations = [estimate_orientation(page) for page in pages] + origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps] orientations = [ - {"value": orientation_page, "confidence": 1.0} for orientation_page in origin_page_orientations + {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations ] else: orientations = None if self.straighten_pages: origin_page_orientations = ( - origin_page_orientations if self.detect_orientation else [estimate_orientation(page) for page in pages] + origin_page_orientations + if self.detect_orientation + else [estimate_orientation(seq_map) for seq_map in seg_maps] ) - pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)] + pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] + # forward again to get predictions on straight pages + loc_preds_dict = self.det_predictor(pages, **kwargs) # type: ignore[assignment] - # Localize text elements - loc_preds_dict = self.det_predictor(pages, **kwargs) assert all( len(loc_pred) == 1 for loc_pred in loc_preds_dict ), "Detection Model in ocr_predictor should output only one class" - - loc_preds: List[np.ndarray] = [list(loc_pred.values())[0] for loc_pred in loc_preds_dict] + loc_preds: List[np.ndarray] = [list(loc_pred.values())[0] for loc_pred in loc_preds_dict] # type: ignore[union-attr] # Rectify crops if aspect ratio loc_preds = self._remove_padding(pages, loc_preds) @@ -115,19 +120,9 @@ def __call__( languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages] else: languages_dict = None - # Rotate back pages and boxes while keeping original image size - if self.straighten_pages: - boxes = [ - rotate_boxes( - page_boxes, - angle, - orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:], - target_shape=mask, # type: ignore[arg-type] - ) - for page_boxes, page, angle, mask in zip(boxes, pages, origin_page_orientations, origin_page_shapes) - ] out = self.doc_builder( + pages, boxes, text_preds, origin_page_shapes, # type: ignore[arg-type] diff --git a/doctr/models/zoo.py b/doctr/models/zoo.py index c7842124e6..1dc131acd7 100644 --- a/doctr/models/zoo.py +++ b/doctr/models/zoo.py @@ -24,6 +24,7 @@ def _predictor( det_bs: int = 2, reco_bs: int = 128, detect_orientation: bool = False, + straighten_pages: bool = False, detect_language: bool = False, **kwargs, ) -> OCRPredictor: @@ -53,6 +54,7 @@ def _predictor( preserve_aspect_ratio=preserve_aspect_ratio, symmetric_pad=symmetric_pad, detect_orientation=detect_orientation, + straighten_pages=straighten_pages, detect_language=detect_language, **kwargs, ) @@ -68,6 +70,7 @@ def ocr_predictor( symmetric_pad: bool = True, export_as_straight_boxes: bool = False, detect_orientation: bool = False, + straighten_pages: bool = False, detect_language: bool = False, **kwargs: Any, ) -> OCRPredictor: @@ -96,6 +99,10 @@ def ocr_predictor( (potentially rotated) as straight bounding boxes. detect_orientation: if True, the estimated general page orientation will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. kwargs: keyword args of `OCRPredictor` @@ -114,6 +121,7 @@ def ocr_predictor( symmetric_pad=symmetric_pad, export_as_straight_boxes=export_as_straight_boxes, detect_orientation=detect_orientation, + straighten_pages=straighten_pages, detect_language=detect_language, **kwargs, ) @@ -130,6 +138,7 @@ def _kie_predictor( det_bs: int = 2, reco_bs: int = 128, detect_orientation: bool = False, + straighten_pages: bool = False, detect_language: bool = False, **kwargs, ) -> KIEPredictor: @@ -159,6 +168,7 @@ def _kie_predictor( preserve_aspect_ratio=preserve_aspect_ratio, symmetric_pad=symmetric_pad, detect_orientation=detect_orientation, + straighten_pages=straighten_pages, detect_language=detect_language, **kwargs, ) @@ -174,6 +184,7 @@ def kie_predictor( symmetric_pad: bool = True, export_as_straight_boxes: bool = False, detect_orientation: bool = False, + straighten_pages: bool = False, detect_language: bool = False, **kwargs: Any, ) -> KIEPredictor: @@ -202,6 +213,10 @@ def kie_predictor( (potentially rotated) as straight bounding boxes. detect_orientation: if True, the estimated general page orientation will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. detect_language: if True, the language prediction will be added to the predictions for each page. Doing so will slightly deteriorate the overall latency. kwargs: keyword args of `OCRPredictor` @@ -220,6 +235,7 @@ def kie_predictor( symmetric_pad=symmetric_pad, export_as_straight_boxes=export_as_straight_boxes, detect_orientation=detect_orientation, + straighten_pages=straighten_pages, detect_language=detect_language, **kwargs, ) diff --git a/scripts/analyze.py b/scripts/analyze.py index 067ed62685..2e0f19c034 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -31,8 +31,8 @@ def main(args): out = model(doc) - for page, img in zip(out.pages, doc): - page.show(img, block=not args.noblock, interactive=not args.static) + for page in out.pages: + page.show(block=not args.noblock, interactive=not args.static) def parse_args(): diff --git a/tests/common/test_io_elements.py b/tests/common/test_io_elements.py index 965033290a..af982c6e04 100644 --- a/tests/common/test_io_elements.py +++ b/tests/common/test_io_elements.py @@ -72,6 +72,7 @@ def _mock_blocks(size=(1, 1), offset=(0, 0)): def _mock_pages(block_size=(1, 1), block_offset=(0, 0)): return [ elements.Page( + np.random.randint(0, 255, (300, 200, 3), dtype=np.uint8), _mock_blocks(block_size, block_offset), 0, (300, 200), @@ -79,6 +80,7 @@ def _mock_pages(block_size=(1, 1), block_offset=(0, 0)): {"value": "EN", "confidence": 0.8}, ), elements.Page( + np.random.randint(0, 255, (500, 1000, 3), dtype=np.uint8), _mock_blocks(block_size, block_offset), 1, (500, 1000), @@ -91,6 +93,7 @@ def _mock_pages(block_size=(1, 1), block_offset=(0, 0)): def _mock_kie_pages(prediction_size=(1, 1), prediction_offset=(0, 0)): return [ elements.KIEPage( + np.random.randint(0, 255, (300, 200, 3), dtype=np.uint8), {CLASS_NAME: _mock_prediction(prediction_size, prediction_offset)}, 0, (300, 200), @@ -98,6 +101,7 @@ def _mock_kie_pages(prediction_size=(1, 1), prediction_offset=(0, 0)): {"value": "EN", "confidence": 0.8}, ), elements.KIEPage( + np.random.randint(0, 255, (500, 1000, 3), dtype=np.uint8), {CLASS_NAME: _mock_prediction(prediction_size, prediction_offset)}, 1, (500, 1000), @@ -243,16 +247,18 @@ def test_block(): def test_page(): + page = np.zeros((300, 200, 3), dtype=np.uint8) page_idx = 0 page_size = (300, 200) orientation = {"value": 0.0, "confidence": 0.0} language = {"value": "EN", "confidence": 0.8} blocks = _mock_blocks() - page = elements.Page(blocks, page_idx, page_size, orientation, language) + page = elements.Page(page, blocks, page_idx, page_size, orientation, language) # Attribute checks assert len(page.blocks) == len(blocks) assert all(isinstance(b, elements.Block) for b in page.blocks) + assert isinstance(page.page, np.ndarray) assert page.page_idx == page_idx assert page.dimensions == page_size assert page.orientation == orientation @@ -281,7 +287,7 @@ def test_page(): assert "\n".join(repr(page).split("\n")[:2]) == f"Page(\n dimensions={page_size!r}" # Show - page.show(np.zeros((256, 256, 3), dtype=np.uint8), block=False) + page.show(block=False) # Synthesize img = page.synthesize() @@ -290,16 +296,18 @@ def test_page(): def test_kiepage(): + page = np.zeros((300, 200, 3), dtype=np.uint8) page_idx = 0 page_size = (300, 200) orientation = {"value": 0.0, "confidence": 0.0} language = {"value": "EN", "confidence": 0.8} predictions = {CLASS_NAME: _mock_prediction()} - kie_page = elements.KIEPage(predictions, page_idx, page_size, orientation, language) + kie_page = elements.KIEPage(page, predictions, page_idx, page_size, orientation, language) # Attribute checks assert len(kie_page.predictions) == len(predictions) assert all(isinstance(b, elements.Prediction) for b in kie_page.predictions[CLASS_NAME]) + assert isinstance(kie_page.page, np.ndarray) assert kie_page.page_idx == page_idx assert kie_page.dimensions == page_size assert kie_page.orientation == orientation @@ -328,7 +336,7 @@ def test_kiepage(): assert "\n".join(repr(kie_page).split("\n")[:2]) == f"KIEPage(\n dimensions={page_size!r}" # Show - kie_page.show(np.zeros((256, 256, 3), dtype=np.uint8), block=False) + kie_page.show(block=False) # Synthesize img = kie_page.synthesize() @@ -355,7 +363,7 @@ def test_document(): assert isinstance(doc.export_as_xml(), list) and len(doc.export_as_xml()) == len(pages) # Show - doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False) + doc.show(block=False) # Synthesize img_list = doc.synthesize() @@ -381,7 +389,7 @@ def test_kie_document(): assert isinstance(doc.export_as_xml(), list) and len(doc.export_as_xml()) == len(pages) # Show - doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False) + doc.show(block=False) # Synthesize img_list = doc.synthesize() diff --git a/tests/common/test_models.py b/tests/common/test_models.py index fea26024b1..25fb2c6c5f 100644 --- a/tests/common/test_models.py +++ b/tests/common/test_models.py @@ -6,7 +6,7 @@ import requests from doctr.io import reader -from doctr.models._utils import estimate_orientation, get_bitmap_angle, get_language, invert_data_structure +from doctr.models._utils import estimate_orientation, get_language, invert_data_structure from doctr.utils import geometry @@ -24,16 +24,19 @@ def mock_image(tmpdir_factory): @pytest.fixture(scope="function") def mock_bitmap(mock_image): bitmap = np.squeeze(cv2.cvtColor(mock_image, cv2.COLOR_BGR2GRAY) / 255.0) + bitmap = np.expand_dims(bitmap, axis=-1) return bitmap -def test_get_bitmap_angle(mock_bitmap): - angle = get_bitmap_angle(mock_bitmap) - assert abs(angle - 30.0) < 1.0 +def test_estimate_orientation(mock_image, mock_bitmap, mock_tilted_payslip): + assert estimate_orientation(mock_image * 0) == 0 + # test binarized image + angle = estimate_orientation(mock_bitmap) + assert abs(angle - 30.0) < 1.0 -def test_estimate_orientation(mock_image, mock_tilted_payslip): - assert estimate_orientation(mock_image * 0) == 0 + angle = estimate_orientation(mock_bitmap * 255) + assert abs(angle - 30.0) < 1.0 angle = estimate_orientation(mock_image) assert abs(angle - 30.0) < 1.0 @@ -49,6 +52,9 @@ def test_estimate_orientation(mock_image, mock_tilted_payslip): angle_rotated = estimate_orientation(rotated) assert abs(angle_rotated) < 1.0 + with pytest.raises(AssertionError): + estimate_orientation(np.ones((10, 10, 10))) + def test_get_lang(): sentence = "This is a test sentence." diff --git a/tests/common/test_models_builder.py b/tests/common/test_models_builder.py index 7940bf8a5d..7d233dafb0 100644 --- a/tests/common/test_models_builder.py +++ b/tests/common/test_models_builder.py @@ -20,25 +20,29 @@ def test_documentbuilder(): # Don't resolve lines doc_builder = builder.DocumentBuilder(resolve_lines=False, resolve_blocks=False) + pages = [np.zeros((100, 200, 3))] * num_pages boxes = np.random.rand(words_per_page, 6) # array format boxes[:2] *= boxes[2:4] # Arg consistency check with pytest.raises(ValueError): - doc_builder([boxes, boxes], [("hello", 1.0)] * 3, [(100, 200), (100, 200)]) - out = doc_builder([boxes, boxes], [[("hello", 1.0)] * words_per_page] * num_pages, [(100, 200), (100, 200)]) + doc_builder(pages, [boxes, boxes], [("hello", 1.0)] * 3, [(100, 200), (100, 200)]) + out = doc_builder(pages, [boxes, boxes], [[("hello", 1.0)] * words_per_page] * num_pages, [(100, 200), (100, 200)]) assert isinstance(out, Document) assert len(out.pages) == num_pages + assert all(isinstance(page.page, np.ndarray) for page in out.pages) and all( + page.page.shape == (100, 200, 3) for page in out.pages + ) # 1 Block & 1 line per page assert len(out.pages[0].blocks) == 1 and len(out.pages[0].blocks[0].lines) == 1 assert len(out.pages[0].blocks[0].lines[0].words) == words_per_page # Resolve lines doc_builder = builder.DocumentBuilder(resolve_lines=True, resolve_blocks=True) - out = doc_builder([boxes, boxes], [[("hello", 1.0)] * words_per_page] * num_pages, [(100, 200), (100, 200)]) + out = doc_builder(pages, [boxes, boxes], [[("hello", 1.0)] * words_per_page] * num_pages, [(100, 200), (100, 200)]) # No detection boxes = np.zeros((0, 5)) - out = doc_builder([boxes, boxes], [[], []], [(100, 200), (100, 200)]) + out = doc_builder(pages, [boxes, boxes], [[], []], [(100, 200), (100, 200)]) assert len(out.pages[0].blocks) == 0 # Rotated boxes to export as straight boxes @@ -49,7 +53,7 @@ def test_documentbuilder(): ] ) doc_builder_2 = builder.DocumentBuilder(resolve_blocks=False, resolve_lines=False, export_as_straight_boxes=True) - out = doc_builder_2([boxes], [[("hello", 0.99), ("word", 0.99)]], [(100, 100)]) + out = doc_builder_2([np.zeros((100, 100, 3))], [boxes], [[("hello", 0.99), ("word", 0.99)]], [(100, 100)]) assert out.pages[0].blocks[0].lines[0].words[-1].geometry == ((0.45, 0.5), (0.6, 0.65)) # Repr @@ -64,18 +68,23 @@ def test_kiedocumentbuilder(): # Don't resolve lines doc_builder = builder.KIEDocumentBuilder(resolve_lines=False, resolve_blocks=False) + pages = [np.zeros((100, 200, 3))] * num_pages predictions = {CLASS_NAME: np.random.rand(words_per_page, 6)} # dict format predictions[CLASS_NAME][:2] *= predictions[CLASS_NAME][2:4] # Arg consistency check with pytest.raises(ValueError): - doc_builder([predictions, predictions], [{CLASS_NAME: ("hello", 1.0)}] * 3, [(100, 200), (100, 200)]) + doc_builder(pages, [predictions, predictions], [{CLASS_NAME: ("hello", 1.0)}] * 3, [(100, 200), (100, 200)]) out = doc_builder( + pages, [predictions, predictions], [{CLASS_NAME: [("hello", 1.0)] * words_per_page}] * num_pages, [(100, 200), (100, 200)], ) assert isinstance(out, KIEDocument) assert len(out.pages) == num_pages + assert all(isinstance(page.page, np.ndarray) for page in out.pages) and all( + page.page.shape == (100, 200, 3) for page in out.pages + ) # 1 Block & 1 line per page assert len(out.pages[0].predictions) == 1 assert len(out.pages[0].predictions[CLASS_NAME]) == words_per_page @@ -83,6 +92,7 @@ def test_kiedocumentbuilder(): # Resolve lines doc_builder = builder.KIEDocumentBuilder(resolve_lines=True, resolve_blocks=True) out = doc_builder( + pages, [predictions, predictions], [{CLASS_NAME: [("hello", 1.0)] * words_per_page}] * num_pages, [(100, 200), (100, 200)], @@ -90,7 +100,7 @@ def test_kiedocumentbuilder(): # No detection predictions = {CLASS_NAME: np.zeros((0, 5))} - out = doc_builder([predictions, predictions], [{CLASS_NAME: []}, {CLASS_NAME: []}], [(100, 200), (100, 200)]) + out = doc_builder(pages, [predictions, predictions], [{CLASS_NAME: []}, {CLASS_NAME: []}], [(100, 200), (100, 200)]) assert len(out.pages[0].predictions[CLASS_NAME]) == 0 # Rotated boxes to export as straight boxes @@ -103,7 +113,9 @@ def test_kiedocumentbuilder(): ) } doc_builder_2 = builder.KIEDocumentBuilder(resolve_blocks=False, resolve_lines=False, export_as_straight_boxes=True) - out = doc_builder_2([predictions], [{CLASS_NAME: [("hello", 0.99), ("word", 0.99)]}], [(100, 100)]) + out = doc_builder_2( + [np.zeros((100, 100, 3))], [predictions], [{CLASS_NAME: [("hello", 0.99), ("word", 0.99)]}], [(100, 100)] + ) assert out.pages[0].predictions[CLASS_NAME][0].geometry == ((0.05, 0.1), (0.2, 0.25)) assert out.pages[0].predictions[CLASS_NAME][1].geometry == ((0.45, 0.5), (0.6, 0.65)) diff --git a/tests/pytorch/test_models_detection_pt.py b/tests/pytorch/test_models_detection_pt.py index 39eae65168..8dac82d436 100644 --- a/tests/pytorch/test_models_detection_pt.py +++ b/tests/pytorch/test_models_detection_pt.py @@ -95,9 +95,13 @@ def test_detection_zoo(arch_name): input_tensor = input_tensor.cuda() with torch.no_grad(): - out = predictor(input_tensor) + out, seq_maps = predictor(input_tensor, return_maps=True) assert all(isinstance(boxes, dict) for boxes in out) assert all(isinstance(boxes[CLASS_NAME], np.ndarray) and boxes[CLASS_NAME].shape[1] == 5 for boxes in out) + assert all(isinstance(seq_map, np.ndarray) for seq_map in seq_maps) + assert all(seq_map.shape[:2] == (1024, 1024) for seq_map in seq_maps) + # check that all values in the seq_maps are between 0 and 1 + assert all((seq_map >= 0).all() and (seq_map <= 1).all() for seq_map in seq_maps) def test_erode(): diff --git a/tests/pytorch/test_models_zoo_pt.py b/tests/pytorch/test_models_zoo_pt.py index cefb77176f..fa3f23b9d1 100644 --- a/tests/pytorch/test_models_zoo_pt.py +++ b/tests/pytorch/test_models_zoo_pt.py @@ -73,10 +73,17 @@ def test_ocrpredictor(mock_pdf, mock_vocab, assume_straight_pages, straighten_pa assert out.pages[0].orientation["value"] == orientation -def test_trained_ocr_predictor(mock_tilted_payslip): - doc = DocumentFile.from_images(mock_tilted_payslip) +def test_trained_ocr_predictor(mock_payslip): + doc = DocumentFile.from_images(mock_payslip) - det_predictor = detection_predictor("db_resnet50", pretrained=True, batch_size=2, assume_straight_pages=True) + det_predictor = detection_predictor( + "db_resnet50", + pretrained=True, + batch_size=2, + assume_straight_pages=True, + symmetric_pad=True, + preserve_aspect_ratio=False, + ) reco_predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=128) predictor = OCRPredictor( @@ -90,16 +97,12 @@ def test_trained_ocr_predictor(mock_tilted_payslip): out = predictor(doc) assert out.pages[0].blocks[0].lines[0].words[0].value == "Mr." - geometry_mr = np.array( - [[0.08563021, 0.35584526], [0.11464554, 0.34078913], [0.1274898, 0.36012764], [0.09847447, 0.37518377]] - ) - assert np.allclose(np.array(out.pages[0].blocks[0].lines[0].words[0].geometry), geometry_mr) + geometry_mr = np.array([[0.1083984375, 0.0634765625], [0.1494140625, 0.0859375]]) + assert np.allclose(np.array(out.pages[0].blocks[0].lines[0].words[0].geometry), geometry_mr, rtol=0.05) assert out.pages[0].blocks[1].lines[0].words[-1].value == "revised" - geometry_revised = np.array( - [[0.50422498, 0.19551784], [0.55741975, 0.16791493], [0.56705294, 0.18241881], [0.51385817, 0.21002172]] - ) - assert np.allclose(np.array(out.pages[0].blocks[1].lines[0].words[-1].geometry), geometry_revised) + geometry_revised = np.array([[0.7548828125, 0.126953125], [0.8388671875, 0.1484375]]) + assert np.allclose(np.array(out.pages[0].blocks[1].lines[0].words[-1].geometry), geometry_revised, rtol=0.05) det_predictor = detection_predictor( "db_resnet50", @@ -181,10 +184,17 @@ def test_kiepredictor(mock_pdf, mock_vocab, assume_straight_pages, straighten_pa assert out.pages[0].orientation["value"] == orientation -def test_trained_kie_predictor(mock_tilted_payslip): - doc = DocumentFile.from_images(mock_tilted_payslip) +def test_trained_kie_predictor(mock_payslip): + doc = DocumentFile.from_images(mock_payslip) - det_predictor = detection_predictor("db_resnet50", pretrained=True, batch_size=2, assume_straight_pages=True) + det_predictor = detection_predictor( + "db_resnet50", + pretrained=True, + batch_size=2, + assume_straight_pages=True, + symmetric_pad=True, + preserve_aspect_ratio=False, + ) reco_predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=128) predictor = KIEPredictor( @@ -199,17 +209,12 @@ def test_trained_kie_predictor(mock_tilted_payslip): assert isinstance(out, KIEDocument) assert out.pages[0].predictions[CLASS_NAME][0].value == "Mr." - geometry_mr = np.array( - [[0.08563021, 0.35584526], [0.11464554, 0.34078913], [0.1274898, 0.36012764], [0.09847447, 0.37518377]] - ) - assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][0].geometry), geometry_mr) + geometry_mr = np.array([[0.1083984375, 0.0634765625], [0.1494140625, 0.0859375]]) + assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][0].geometry), geometry_mr, rtol=0.05) - print(out.pages[0].predictions[CLASS_NAME]) - assert out.pages[0].predictions[CLASS_NAME][7].value == "revised" - geometry_revised = np.array( - [[0.50422498, 0.19551784], [0.55741975, 0.16791493], [0.56705294, 0.18241881], [0.51385817, 0.21002172]] - ) - assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][7].geometry), geometry_revised) + assert out.pages[0].predictions[CLASS_NAME][6].value == "revised" + geometry_revised = np.array([[0.7548828125, 0.126953125], [0.8388671875, 0.1484375]]) + assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][6].geometry), geometry_revised, rtol=0.05) det_predictor = detection_predictor( "db_resnet50", diff --git a/tests/tensorflow/test_models_detection_tf.py b/tests/tensorflow/test_models_detection_tf.py index ef8d6920ef..d5411f3027 100644 --- a/tests/tensorflow/test_models_detection_tf.py +++ b/tests/tensorflow/test_models_detection_tf.py @@ -146,9 +146,13 @@ def test_detection_zoo(arch_name): # object check assert isinstance(predictor, DetectionPredictor) input_tensor = tf.random.uniform(shape=[2, 1024, 1024, 3], minval=0, maxval=1) - out = predictor(input_tensor) + out, seq_maps = predictor(input_tensor, return_maps=True) assert all(isinstance(boxes, dict) for boxes in out) assert all(isinstance(boxes[CLASS_NAME], np.ndarray) and boxes[CLASS_NAME].shape[1] == 5 for boxes in out) + assert all(isinstance(seq_map, np.ndarray) for seq_map in seq_maps) + assert all(seq_map.shape[:2] == (1024, 1024) for seq_map in seq_maps) + # check that all values in the seq_maps are between 0 and 1 + assert all((seq_map >= 0).all() and (seq_map <= 1).all() for seq_map in seq_maps) def test_detection_zoo_error(): diff --git a/tests/tensorflow/test_models_zoo_tf.py b/tests/tensorflow/test_models_zoo_tf.py index 6d4b85e2c8..32e7988560 100644 --- a/tests/tensorflow/test_models_zoo_tf.py +++ b/tests/tensorflow/test_models_zoo_tf.py @@ -72,10 +72,17 @@ def test_ocrpredictor(mock_pdf, mock_vocab, assume_straight_pages, straighten_pa assert out.pages[0].language["value"] == language -def test_trained_ocr_predictor(mock_tilted_payslip): - doc = DocumentFile.from_images(mock_tilted_payslip) +def test_trained_ocr_predictor(mock_payslip): + doc = DocumentFile.from_images(mock_payslip) - det_predictor = detection_predictor("db_resnet50", pretrained=True, batch_size=2, assume_straight_pages=True) + det_predictor = detection_predictor( + "db_resnet50", + pretrained=True, + batch_size=2, + assume_straight_pages=True, + symmetric_pad=True, + preserve_aspect_ratio=False, + ) reco_predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=128) predictor = OCRPredictor( @@ -89,16 +96,12 @@ def test_trained_ocr_predictor(mock_tilted_payslip): out = predictor(doc) assert out.pages[0].blocks[0].lines[0].words[0].value == "Mr." - geometry_mr = np.array( - [[0.08844472, 0.35763523], [0.11625107, 0.34320644], [0.12588427, 0.35771032], [0.09807791, 0.37213911]] - ) - assert np.allclose(np.array(out.pages[0].blocks[0].lines[0].words[0].geometry), geometry_mr) + geometry_mr = np.array([[0.1083984375, 0.0634765625], [0.1494140625, 0.0859375]]) + assert np.allclose(np.array(out.pages[0].blocks[0].lines[0].words[0].geometry), geometry_mr, rtol=0.05) assert out.pages[0].blocks[1].lines[0].words[-1].value == "revised" - geometry_revised = np.array( - [[0.50422498, 0.19551784], [0.55741975, 0.16791493], [0.56705294, 0.18241881], [0.51385817, 0.21002172]] - ) - assert np.allclose(np.array(out.pages[0].blocks[1].lines[0].words[-1].geometry), geometry_revised) + geometry_revised = np.array([[0.7548828125, 0.126953125], [0.8388671875, 0.1484375]]) + assert np.allclose(np.array(out.pages[0].blocks[1].lines[0].words[-1].geometry), geometry_revised, rtol=0.05) det_predictor = detection_predictor( "db_resnet50", @@ -179,10 +182,17 @@ def test_kiepredictor(mock_pdf, mock_vocab, assume_straight_pages, straighten_pa assert out.pages[0].language["value"] == language -def test_trained_kie_predictor(mock_tilted_payslip): - doc = DocumentFile.from_images(mock_tilted_payslip) +def test_trained_kie_predictor(mock_payslip): + doc = DocumentFile.from_images(mock_payslip) - det_predictor = detection_predictor("db_resnet50", pretrained=True, batch_size=2, assume_straight_pages=True) + det_predictor = detection_predictor( + "db_resnet50", + pretrained=True, + batch_size=2, + assume_straight_pages=True, + symmetric_pad=True, + preserve_aspect_ratio=False, + ) reco_predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=128) predictor = KIEPredictor( @@ -197,16 +207,12 @@ def test_trained_kie_predictor(mock_tilted_payslip): assert isinstance(out, KIEDocument) assert out.pages[0].predictions[CLASS_NAME][0].value == "Mr." - geometry_mr = np.array( - [[0.08844472, 0.35763523], [0.11625107, 0.34320644], [0.12588427, 0.35771032], [0.09807791, 0.37213911]] - ) - assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][0].geometry), geometry_mr) + geometry_mr = np.array([[0.1083984375, 0.0634765625], [0.1494140625, 0.0859375]]) + assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][0].geometry), geometry_mr, rtol=0.05) - assert out.pages[0].predictions[CLASS_NAME][-1].value == "Kabir)" - geometry_revised = np.array( - [[0.43725992, 0.67232439], [0.49045468, 0.64472149], [0.50570724, 0.66768597], [0.452512473, 0.69528887]] - ) - assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][-1].geometry), geometry_revised) + assert out.pages[0].predictions[CLASS_NAME][3].value == "revised" + geometry_revised = np.array([[0.7548828125, 0.126953125], [0.8388671875, 0.1484375]]) + assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][3].geometry), geometry_revised, rtol=0.05) det_predictor = detection_predictor( "db_resnet50",