[skip ci] some updates and fixes

mindee · Oct 4, 2023 · 1edd231 · 1edd231
1 parent 92f939e
commit 1edd231
Show file tree

Hide file tree

Showing 9 changed files with 79 additions and 107 deletions.
diff --git a/README.md b/README.md
@@ -75,7 +75,7 @@ If both options are set to False, the predictor will always fit and return rotat
 To interpret your model's predictions, you can visualize them interactively as follows:
 
 ```python
-result.show(doc)
+result.show()
 ```
 
 ![Visualization sample](docs/images/doctr_example_script.gif)

diff --git a/doctr/io/elements.py b/doctr/io/elements.py
@@ -231,6 +231,7 @@ class Page(Element):
     """Implements a page element as a collection of blocks
 
     Args:
+        page: image encoded as a numpy array in uint8
         blocks: list of block elements
         page_idx: the index of the page in the input raw document
         dimensions: the page size in pixels in format (height, width)
@@ -244,13 +245,15 @@ class Page(Element):
 
     def __init__(
         self,
+        page: np.ndarray,
         blocks: List[Block],
         page_idx: int,
         dimensions: Tuple[int, int],
         orientation: Optional[Dict[str, Any]] = None,
         language: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__(blocks=blocks)
+        self.page = page
         self.page_idx = page_idx
         self.dimensions = dimensions
         self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
@@ -263,15 +266,14 @@ def render(self, block_break: str = "\n\n") -> str:
     def extra_repr(self) -> str:
         return f"dimensions={self.dimensions}"
 
-    def show(self, page: np.ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
+    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
         """Overlay the result on a given image
 
         Args:
-            page: image encoded as a numpy array in uint8
             interactive: whether the display should be interactive
             preserve_aspect_ratio: pass True if you passed True to the predictor
         """
-        visualize_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
+        visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
         plt.show(**kwargs)
 
     def synthesize(self, **kwargs) -> np.ndarray:
@@ -399,6 +401,7 @@ class KIEPage(Element):
 
     Args:
         predictions: Dictionary with list of block elements for each detection class
+        page: image encoded as a numpy array in uint8
         page_idx: the index of the page in the input raw document
         dimensions: the page size in pixels in format (height, width)
         orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction
@@ -411,13 +414,15 @@ class KIEPage(Element):
 
     def __init__(
         self,
+        page: np.ndarray,
         predictions: Dict[str, List[Prediction]],
         page_idx: int,
         dimensions: Tuple[int, int],
         orientation: Optional[Dict[str, Any]] = None,
         language: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__(predictions=predictions)
+        self.page = page
         self.page_idx = page_idx
         self.dimensions = dimensions
         self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
@@ -432,15 +437,16 @@ def render(self, prediction_break: str = "\n\n") -> str:
     def extra_repr(self) -> str:
         return f"dimensions={self.dimensions}"
 
-    def show(self, page: np.ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
+    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
         """Overlay the result on a given image
 
         Args:
-            page: image encoded as a numpy array in uint8
             interactive: whether the display should be interactive
             preserve_aspect_ratio: pass True if you passed True to the predictor
         """
-        visualize_kie_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
+        visualize_kie_page(
+            self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio
+        )
         plt.show(**kwargs)
 
     def synthesize(self, **kwargs) -> np.ndarray:
@@ -543,14 +549,10 @@ def render(self, page_break: str = "\n\n\n\n") -> str:
         """Renders the full text of the element"""
         return page_break.join(p.render() for p in self.pages)
 
-    def show(self, pages: List[np.ndarray], **kwargs) -> None:
-        """Overlay the result on a given image
-
-        Args:
-            pages: list of images encoded as numpy arrays in uint8
-        """
-        for img, result in zip(pages, self.pages):
-            result.show(img, **kwargs)
+    def show(self, **kwargs) -> None:
+        """Overlay the result of each page on the image stored with that page"""
+        for result in self.pages:
+            result.show(**kwargs)
 
     def synthesize(self, **kwargs) -> List[np.ndarray]:
         """Synthesize all pages from their predictions

diff --git a/doctr/models/builder.py b/doctr/models/builder.py
@@ -280,6 +280,7 @@ def extra_repr(self) -> str:
 
     def __call__(
         self,
+        pages: List[np.ndarray],
         boxes: List[np.ndarray],
         text_preds: List[List[Tuple[str, float]]],
         page_shapes: List[Tuple[int, int]],
@@ -289,6 +290,7 @@ def __call__(
         """Re-arrange detected words into structured blocks
 
         Args:
+            pages: list of N elements, where each element represents the page image
             boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5)
                 or (*, 6) for all words for a given page
             text_preds: list of N elements, where each element is the list of all word prediction (text + confidence)
@@ -316,6 +318,7 @@ def __call__(
 
         _pages = [
             Page(
+                page,
                 self._build_blocks(
                     page_boxes,
                     word_preds,
@@ -325,8 +328,8 @@ def __call__(
                 orientation,
                 language,
             )
-            for _idx, shape, page_boxes, word_preds, orientation, language in zip(
-                range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages
+            for page, _idx, shape, page_boxes, word_preds, orientation, language in zip(
+                pages, range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages
             )
         ]
 
@@ -346,6 +349,7 @@ class KIEDocumentBuilder(DocumentBuilder):
 
     def __call__(  # type: ignore[override]
         self,
+        pages: List[np.ndarray],
         boxes: List[Dict[str, np.ndarray]],
         text_preds: List[Dict[str, List[Tuple[str, float]]]],
         page_shapes: List[Tuple[int, int]],
@@ -355,6 +359,7 @@ def __call__(  # type: ignore[override]
         """Re-arrange detected words into structured predictions
 
         Args:
+            pages: list of N elements, where each element represents the page image
             boxes: list of N dictionaries, where each element represents the localization predictions for a class,
             of shape (*, 5) or (*, 6) for all predictions
             text_preds: list of N dictionaries, where each element is the list of all word prediction
@@ -384,6 +389,7 @@ def __call__(  # type: ignore[override]
 
         _pages = [
             KIEPage(
+                page,
                 {
                     k: self._build_blocks(
                         page_boxes[k],
@@ -396,8 +402,8 @@ def __call__(  # type: ignore[override]
                 orientation,
                 language,
             )
-            for _idx, shape, page_boxes, word_preds, orientation, language in zip(
-                range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages
+            for page, _idx, shape, page_boxes, word_preds, orientation, language in zip(
+                pages, range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages
             )
         ]
 

diff --git a/doctr/models/kie_predictor/pytorch.py b/doctr/models/kie_predictor/pytorch.py
@@ -13,7 +13,7 @@
 from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import rotate_boxes, rotate_image
+from doctr.utils.geometry import rotate_image
 
 from .base import _KIEPredictor
 
@@ -76,7 +76,9 @@ def forward(
 
         # Detect document rotation and rotate pages
         seq_maps = [
-            np.sum(np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0), axis=-1, keepdims=True).astype(np.uint8)
+            np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype(
+                np.uint8
+            )
             for out_map in out_maps
         ]
         if self.detect_orientation:
@@ -93,10 +95,10 @@ def forward(
                 else [estimate_orientation(seq_map) for seq_map in seq_maps]
             )
             pages = [
-                rotate_image(page, -angle, expand=True)  # type: ignore[arg-type]
+                rotate_image(page, -angle, expand=False)  # type: ignore[arg-type]
                 for page, angle in zip(pages, origin_page_orientations)
             ]
-            # forward again to get predictions on straight pages
+            # Forward again to get predictions on straight pages
             loc_preds = self.det_predictor(pages, **kwargs)
 
         dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds)  # type: ignore[assignment]
@@ -143,29 +145,12 @@ def forward(
             languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
         else:
             languages_dict = None
-        # Rotate back pages and boxes while keeping original image size
-        if self.straighten_pages:
-            boxes_per_page = [
-                {
-                    k: rotate_boxes(
-                        page_boxes,
-                        angle,
-                        orig_shape=page.shape[:2]
-                        if isinstance(page, np.ndarray)
-                        else page.shape[1:],  # type: ignore[arg-type]
-                        target_shape=mask,  # type: ignore[arg-type]
-                    )
-                    for k, page_boxes in page_boxes_dict.items()
-                }
-                for page_boxes_dict, page, angle, mask in zip(
-                    boxes_per_page, pages, origin_page_orientations, origin_page_shapes
-                )
-            ]
 
         out = self.doc_builder(
+            pages,  # type: ignore[arg-type]
             boxes_per_page,
             text_preds_per_page,
-            [page.shape[:2] if channels_last else page.shape[-2:] for page in pages],  # type: ignore[misc]
+            origin_page_shapes,  # type: ignore[arg-type]
             orientations,
             languages_dict,
         )

diff --git a/doctr/models/kie_predictor/tensorflow.py b/doctr/models/kie_predictor/tensorflow.py
@@ -12,7 +12,7 @@
 from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import rotate_boxes, rotate_image
+from doctr.utils.geometry import rotate_image
 from doctr.utils.repr import NestedObject
 
 from .base import _KIEPredictor
@@ -72,11 +72,13 @@ def __call__(
         origin_page_shapes = [page.shape[:2] for page in pages]
 
         # Localize text elements
-        loc_preds, out_maps = self.det_predictor(pages, return_preds=True, **kwargs)
+        loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs)
 
         # Detect document rotation and rotate pages
         seq_maps = [
-            np.sum(np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0), axis=-1, keepdims=True).astype(np.uint8)
+            np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype(
+                np.uint8
+            )
             for out_map in out_maps
         ]
         if self.detect_orientation:
@@ -92,8 +94,8 @@ def __call__(
                 if self.detect_orientation
                 else [estimate_orientation(seq_map) for seq_map in seq_maps]
             )
-            pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)]
-            # forward again to get predictions on straight pages
+            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
+            # Forward again to get predictions on straight pages
             loc_preds = self.det_predictor(pages, **kwargs)  # type: ignore[assignment]
 
         dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds)  # type: ignore
@@ -134,24 +136,9 @@ def __call__(
             languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
         else:
             languages_dict = None
-        # Rotate back pages and boxes while keeping original image size
-        if self.straighten_pages:
-            boxes_per_page = [
-                {
-                    k: rotate_boxes(
-                        page_boxes,
-                        angle,
-                        orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:],
-                        target_shape=mask,  # type: ignore[arg-type]
-                    )
-                    for k, page_boxes in page_boxes_dict.items()
-                }
-                for page_boxes_dict, page, angle, mask in zip(
-                    boxes_per_page, pages, origin_page_orientations, origin_page_shapes
-                )
-            ]
 
         out = self.doc_builder(
+            pages,
             boxes_per_page,
             text_preds_per_page,
             origin_page_shapes,  # type: ignore[arg-type]

diff --git a/doctr/models/predictor/pytorch.py b/doctr/models/predictor/pytorch.py
@@ -13,7 +13,7 @@
 from doctr.models._utils import estimate_orientation, get_language
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import rotate_boxes, rotate_image
+from doctr.utils.geometry import rotate_image
 
 from .base import _OCRPredictor
 
@@ -90,10 +90,10 @@ def forward(
                 else [estimate_orientation(seq_map) for seq_map in seq_maps]
             )
             pages = [
-                rotate_image(page, -angle, expand=True)  # type: ignore[arg-type]
+                rotate_image(page, -angle, expand=False)  # type: ignore[arg-type]
                 for page, angle in zip(pages, origin_page_orientations)
             ]
-            # forward again to get predictions on straight pages
+            # Forward again to get predictions on straight pages
             loc_preds = self.det_predictor(pages, **kwargs)
 
         assert all(
@@ -127,24 +127,12 @@ def forward(
             languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
         else:
             languages_dict = None
-        # Rotate back pages and boxes while keeping original image size
-        if self.straighten_pages:
-            boxes = [
-                rotate_boxes(
-                    page_boxes,
-                    angle,
-                    orig_shape=page.shape[:2]
-                    if isinstance(page, np.ndarray)
-                    else page.shape[1:],  # type: ignore[arg-type]
-                    target_shape=mask,  # type: ignore[arg-type]
-                )
-                for page_boxes, page, angle, mask in zip(boxes, pages, origin_page_orientations, origin_page_shapes)
-            ]
 
         out = self.doc_builder(
+            pages,  # type: ignore[arg-type]
             boxes,
             text_preds,
-            [page.shape[:2] if channels_last else page.shape[-2:] for page in pages],  # type: ignore[misc]
+            origin_page_shapes,  # type: ignore[arg-type]
             orientations,
             languages_dict,
         )

diff --git a/doctr/models/predictor/tensorflow.py b/doctr/models/predictor/tensorflow.py
@@ -12,7 +12,7 @@
 from doctr.models._utils import estimate_orientation, get_language
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import rotate_boxes, rotate_image
+from doctr.utils.geometry import rotate_image
 from doctr.utils.repr import NestedObject
 
 from .base import _OCRPredictor
@@ -89,7 +89,7 @@ def __call__(
                 if self.detect_orientation
                 else [estimate_orientation(seq_map) for seq_map in seq_maps]
             )
-            pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)]
+            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
             # forward again to get predictions on straight pages
             loc_preds_dict = self.det_predictor(pages, **kwargs)  # type: ignore[assignment]
 
@@ -121,19 +121,9 @@ def __call__(
             languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
         else:
             languages_dict = None
-        # Rotate back pages and boxes while keeping original image size
-        if self.straighten_pages:
-            boxes = [
-                rotate_boxes(
-                    page_boxes,
-                    angle,
-                    orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:],
-                    target_shape=mask,  # type: ignore[arg-type]
-                )
-                for page_boxes, page, angle, mask in zip(boxes, pages, origin_page_orientations, origin_page_shapes)
-            ]
 
         out = self.doc_builder(
+            pages,
             boxes,
             text_preds,
             origin_page_shapes,  # type: ignore[arg-type]