Skip to content

Commit

Permalink
[skip ci] some updates and fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
felixdittrich92 committed Oct 4, 2023
1 parent 92f939e commit 1edd231
Show file tree
Hide file tree
Showing 9 changed files with 79 additions and 107 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ If both options are set to False, the predictor will always fit and return rotat
To interpret your model's predictions, you can visualize them interactively as follows:

```python
result.show(doc)
result.show()
```

![Visualization sample](docs/images/doctr_example_script.gif)
Expand Down
30 changes: 16 additions & 14 deletions doctr/io/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ class Page(Element):
"""Implements a page element as a collection of blocks
Args:
page: image encoded as a numpy array in uint8
blocks: list of block elements
page_idx: the index of the page in the input raw document
dimensions: the page size in pixels in format (height, width)
Expand All @@ -244,13 +245,15 @@ class Page(Element):

def __init__(
self,
page: np.ndarray,
blocks: List[Block],
page_idx: int,
dimensions: Tuple[int, int],
orientation: Optional[Dict[str, Any]] = None,
language: Optional[Dict[str, Any]] = None,
) -> None:
super().__init__(blocks=blocks)
self.page = page
self.page_idx = page_idx
self.dimensions = dimensions
self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
Expand All @@ -263,15 +266,14 @@ def render(self, block_break: str = "\n\n") -> str:
def extra_repr(self) -> str:
return f"dimensions={self.dimensions}"

def show(self, page: np.ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
"""Overlay the result on a given image
Args:
page: image encoded as a numpy array in uint8
interactive: whether the display should be interactive
preserve_aspect_ratio: pass True if you passed True to the predictor
"""
visualize_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
plt.show(**kwargs)

def synthesize(self, **kwargs) -> np.ndarray:
Expand Down Expand Up @@ -399,6 +401,7 @@ class KIEPage(Element):
Args:
predictions: Dictionary with list of block elements for each detection class
page: image encoded as a numpy array in uint8
page_idx: the index of the page in the input raw document
dimensions: the page size in pixels in format (height, width)
orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction
Expand All @@ -411,13 +414,15 @@ class KIEPage(Element):

def __init__(
self,
page: np.ndarray,
predictions: Dict[str, List[Prediction]],
page_idx: int,
dimensions: Tuple[int, int],
orientation: Optional[Dict[str, Any]] = None,
language: Optional[Dict[str, Any]] = None,
) -> None:
super().__init__(predictions=predictions)
self.page = page
self.page_idx = page_idx
self.dimensions = dimensions
self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
Expand All @@ -432,15 +437,16 @@ def render(self, prediction_break: str = "\n\n") -> str:
def extra_repr(self) -> str:
return f"dimensions={self.dimensions}"

def show(self, page: np.ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
"""Overlay the result on a given image
Args:
page: image encoded as a numpy array in uint8
interactive: whether the display should be interactive
preserve_aspect_ratio: pass True if you passed True to the predictor
"""
visualize_kie_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
visualize_kie_page(
self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio
)
plt.show(**kwargs)

def synthesize(self, **kwargs) -> np.ndarray:
Expand Down Expand Up @@ -543,14 +549,10 @@ def render(self, page_break: str = "\n\n\n\n") -> str:
"""Renders the full text of the element"""
return page_break.join(p.render() for p in self.pages)

def show(self, pages: List[np.ndarray], **kwargs) -> None:
"""Overlay the result on a given image
Args:
pages: list of images encoded as numpy arrays in uint8
"""
for img, result in zip(pages, self.pages):
result.show(img, **kwargs)
def show(self, **kwargs) -> None:
"""Overlay the result on a given image"""
for result in self.pages:
result.show(**kwargs)

def synthesize(self, **kwargs) -> List[np.ndarray]:
"""Synthesize all pages from their predictions
Expand Down
14 changes: 10 additions & 4 deletions doctr/models/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ def extra_repr(self) -> str:

def __call__(
self,
pages: List[np.ndarray],
boxes: List[np.ndarray],
text_preds: List[List[Tuple[str, float]]],
page_shapes: List[Tuple[int, int]],
Expand All @@ -289,6 +290,7 @@ def __call__(
"""Re-arrange detected words into structured blocks
Args:
pages: list of N elements, where each element represents the page image
boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5)
or (*, 6) for all words for a given page
text_preds: list of N elements, where each element is the list of all word predictions (text + confidence)
Expand Down Expand Up @@ -316,6 +318,7 @@ def __call__(

_pages = [
Page(
page,
self._build_blocks(
page_boxes,
word_preds,
Expand All @@ -325,8 +328,8 @@ def __call__(
orientation,
language,
)
for _idx, shape, page_boxes, word_preds, orientation, language in zip(
range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages
for page, _idx, shape, page_boxes, word_preds, orientation, language in zip(
pages, range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages
)
]

Expand All @@ -346,6 +349,7 @@ class KIEDocumentBuilder(DocumentBuilder):

def __call__( # type: ignore[override]
self,
pages: List[np.ndarray],
boxes: List[Dict[str, np.ndarray]],
text_preds: List[Dict[str, List[Tuple[str, float]]]],
page_shapes: List[Tuple[int, int]],
Expand All @@ -355,6 +359,7 @@ def __call__( # type: ignore[override]
"""Re-arrange detected words into structured predictions
Args:
pages: list of N elements, where each element represents the page image
boxes: list of N dictionaries, where each element represents the localization predictions for a class,
of shape (*, 5) or (*, 6) for all predictions
text_preds: list of N dictionaries, where each element is the list of all word predictions
Expand Down Expand Up @@ -384,6 +389,7 @@ def __call__( # type: ignore[override]

_pages = [
KIEPage(
page,
{
k: self._build_blocks(
page_boxes[k],
Expand All @@ -396,8 +402,8 @@ def __call__( # type: ignore[override]
orientation,
language,
)
for _idx, shape, page_boxes, word_preds, orientation, language in zip(
range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages
for page, _idx, shape, page_boxes, word_preds, orientation, language in zip(
pages, range(len(boxes)), page_shapes, boxes, text_preds, _orientations, _languages
)
]

Expand Down
31 changes: 8 additions & 23 deletions doctr/models/kie_predictor/pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
from doctr.models.detection.predictor import DetectionPredictor
from doctr.models.recognition.predictor import RecognitionPredictor
from doctr.utils.geometry import rotate_boxes, rotate_image
from doctr.utils.geometry import rotate_image

from .base import _KIEPredictor

Expand Down Expand Up @@ -76,7 +76,9 @@ def forward(

# Detect document rotation and rotate pages
seq_maps = [
np.sum(np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0), axis=-1, keepdims=True).astype(np.uint8)
np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype(
np.uint8
)
for out_map in out_maps
]
if self.detect_orientation:
Expand All @@ -93,10 +95,10 @@ def forward(
else [estimate_orientation(seq_map) for seq_map in seq_maps]
)
pages = [
rotate_image(page, -angle, expand=True) # type: ignore[arg-type]
rotate_image(page, -angle, expand=False) # type: ignore[arg-type]
for page, angle in zip(pages, origin_page_orientations)
]
# forward again to get predictions on straight pages
# Forward again to get predictions on straight pages
loc_preds = self.det_predictor(pages, **kwargs)

dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds) # type: ignore[assignment]
Expand Down Expand Up @@ -143,29 +145,12 @@ def forward(
languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
else:
languages_dict = None
# Rotate back pages and boxes while keeping original image size
if self.straighten_pages:
boxes_per_page = [
{
k: rotate_boxes(
page_boxes,
angle,
orig_shape=page.shape[:2]
if isinstance(page, np.ndarray)
else page.shape[1:], # type: ignore[arg-type]
target_shape=mask, # type: ignore[arg-type]
)
for k, page_boxes in page_boxes_dict.items()
}
for page_boxes_dict, page, angle, mask in zip(
boxes_per_page, pages, origin_page_orientations, origin_page_shapes
)
]

out = self.doc_builder(
pages, # type: ignore[arg-type]
boxes_per_page,
text_preds_per_page,
[page.shape[:2] if channels_last else page.shape[-2:] for page in pages], # type: ignore[misc]
origin_page_shapes, # type: ignore[arg-type]
orientations,
languages_dict,
)
Expand Down
29 changes: 8 additions & 21 deletions doctr/models/kie_predictor/tensorflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
from doctr.models.detection.predictor import DetectionPredictor
from doctr.models.recognition.predictor import RecognitionPredictor
from doctr.utils.geometry import rotate_boxes, rotate_image
from doctr.utils.geometry import rotate_image
from doctr.utils.repr import NestedObject

from .base import _KIEPredictor
Expand Down Expand Up @@ -72,11 +72,13 @@ def __call__(
origin_page_shapes = [page.shape[:2] for page in pages]

# Localize text elements
loc_preds, out_maps = self.det_predictor(pages, return_preds=True, **kwargs)
loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs)

# Detect document rotation and rotate pages
seq_maps = [
np.sum(np.where(out_map > kwargs.get("bin_thresh", 0.3), 255, 0), axis=-1, keepdims=True).astype(np.uint8)
np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype(
np.uint8
)
for out_map in out_maps
]
if self.detect_orientation:
Expand All @@ -92,8 +94,8 @@ def __call__(
if self.detect_orientation
else [estimate_orientation(seq_map) for seq_map in seq_maps]
)
pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)]
# forward again to get predictions on straight pages
pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
# Forward again to get predictions on straight pages
loc_preds = self.det_predictor(pages, **kwargs) # type: ignore[assignment]

dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds) # type: ignore
Expand Down Expand Up @@ -134,24 +136,9 @@ def __call__(
languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
else:
languages_dict = None
# Rotate back pages and boxes while keeping original image size
if self.straighten_pages:
boxes_per_page = [
{
k: rotate_boxes(
page_boxes,
angle,
orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:],
target_shape=mask, # type: ignore[arg-type]
)
for k, page_boxes in page_boxes_dict.items()
}
for page_boxes_dict, page, angle, mask in zip(
boxes_per_page, pages, origin_page_orientations, origin_page_shapes
)
]

out = self.doc_builder(
pages,
boxes_per_page,
text_preds_per_page,
origin_page_shapes, # type: ignore[arg-type]
Expand Down
22 changes: 5 additions & 17 deletions doctr/models/predictor/pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from doctr.models._utils import estimate_orientation, get_language
from doctr.models.detection.predictor import DetectionPredictor
from doctr.models.recognition.predictor import RecognitionPredictor
from doctr.utils.geometry import rotate_boxes, rotate_image
from doctr.utils.geometry import rotate_image

from .base import _OCRPredictor

Expand Down Expand Up @@ -90,10 +90,10 @@ def forward(
else [estimate_orientation(seq_map) for seq_map in seq_maps]
)
pages = [
rotate_image(page, -angle, expand=True) # type: ignore[arg-type]
rotate_image(page, -angle, expand=False) # type: ignore[arg-type]
for page, angle in zip(pages, origin_page_orientations)
]
# forward again to get predictions on straight pages
# Forward again to get predictions on straight pages
loc_preds = self.det_predictor(pages, **kwargs)

assert all(
Expand Down Expand Up @@ -127,24 +127,12 @@ def forward(
languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
else:
languages_dict = None
# Rotate back pages and boxes while keeping original image size
if self.straighten_pages:
boxes = [
rotate_boxes(
page_boxes,
angle,
orig_shape=page.shape[:2]
if isinstance(page, np.ndarray)
else page.shape[1:], # type: ignore[arg-type]
target_shape=mask, # type: ignore[arg-type]
)
for page_boxes, page, angle, mask in zip(boxes, pages, origin_page_orientations, origin_page_shapes)
]

out = self.doc_builder(
pages, # type: ignore[arg-type]
boxes,
text_preds,
[page.shape[:2] if channels_last else page.shape[-2:] for page in pages], # type: ignore[misc]
origin_page_shapes, # type: ignore[arg-type]
orientations,
languages_dict,
)
Expand Down
16 changes: 3 additions & 13 deletions doctr/models/predictor/tensorflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from doctr.models._utils import estimate_orientation, get_language
from doctr.models.detection.predictor import DetectionPredictor
from doctr.models.recognition.predictor import RecognitionPredictor
from doctr.utils.geometry import rotate_boxes, rotate_image
from doctr.utils.geometry import rotate_image
from doctr.utils.repr import NestedObject

from .base import _OCRPredictor
Expand Down Expand Up @@ -89,7 +89,7 @@ def __call__(
if self.detect_orientation
else [estimate_orientation(seq_map) for seq_map in seq_maps]
)
pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)]
pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
# forward again to get predictions on straight pages
loc_preds_dict = self.det_predictor(pages, **kwargs) # type: ignore[assignment]

Expand Down Expand Up @@ -121,19 +121,9 @@ def __call__(
languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
else:
languages_dict = None
# Rotate back pages and boxes while keeping original image size
if self.straighten_pages:
boxes = [
rotate_boxes(
page_boxes,
angle,
orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:],
target_shape=mask, # type: ignore[arg-type]
)
for page_boxes, page, angle, mask in zip(boxes, pages, origin_page_orientations, origin_page_shapes)
]

out = self.doc_builder(
pages,
boxes,
text_preds,
origin_page_shapes, # type: ignore[arg-type]
Expand Down
Loading

0 comments on commit 1edd231

Please sign in to comment.