adding gujarati vocabulary dec 4 #1811

Closed
wants to merge 7 commits
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
@@ -42,7 +42,7 @@ jobs:
ssh-private-key: ${{ secrets.SSH_DEPLOY_KEY }}

- name: Deploy to Github Pages
uses: JamesIves/[email protected].1
uses: JamesIves/[email protected].2
with:
BRANCH: gh-pages
FOLDER: 'docs/build'
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
- id: no-commit-to-branch
args: ['--branch', 'main']
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.4
rev: v0.8.1
hooks:
- id: ruff
args: [ --fix ]
11 changes: 7 additions & 4 deletions docs/source/modules/datasets.rst
@@ -169,17 +169,20 @@ of vocabs.
- 115
- абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ґіїєҐІЇЄ₴
* - vietnamese
- 236
- 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ
- 234
- 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰÍÌỈĨỊÝỲỶỸỴ
* - hebrew
- 123
- 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿אבגדהוזחטיכלמנסעפצקרשת₪
* - hindi
- 71
- अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥
- 68
- अआइईउऊऋॠऌॡएऐओऔंःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥
* - bangla
- 70
- অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ০১২৩৪৫৬৭৮৯
* - gujarati
- 107
- અઆઇઈઉઊઋએઐઓઔઅંઅઃકખગઘચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળક્ષજ્ઞ૦૧૨૩૪૫૬૭૮૯!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~૰ઽ◌ંઃ॥ૐ઼ ઁ૱
* - multilingual
- 195
- english & french & german & italian & spanish & portuguese & czech & polish & dutch & norwegian & danish & finnish & swedish & §
77 changes: 72 additions & 5 deletions docs/source/using_doctr/using_model_export.rst
@@ -31,7 +31,11 @@ Advantages:
.. code:: python3

import torch
predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True).cuda().half()
predictor = ocr_predictor(
reco_arch="crnn_mobilenet_v3_small",
det_arch="linknet_resnet34",
pretrained=True
).cuda().half()
res = predictor(doc)

.. tab:: TensorFlow
@@ -41,8 +45,63 @@ Advantages:
import tensorflow as tf
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')
predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)

predictor = ocr_predictor(
reco_arch="crnn_mobilenet_v3_small",
det_arch="linknet_resnet34",
pretrained=True
)


Compiling your models (PyTorch only)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

**NOTE:**

- This feature is only available if you use PyTorch as the backend.
- The recognition architecture `master` is not supported for model compilation yet.
- We only provide official support for the default (`inductor`) backend, but you can try other backends and configurations depending on your hardware and requirements.

Compiling your PyTorch models with `torch.compile` optimizes the model by converting it to a graph representation and applying backends that can improve performance.
This process can make inference faster and reduce memory overhead during execution.

Further information can be found in the `PyTorch documentation <https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html>`_.

.. code:: python3

import torch
from doctr.models import (
ocr_predictor,
vitstr_small,
fast_base,
mobilenet_v3_small_crop_orientation,
mobilenet_v3_small_page_orientation,
crop_orientation_predictor,
page_orientation_predictor
)

# Compile the models
detection_model = torch.compile(
fast_base(pretrained=True).eval()
)
recognition_model = torch.compile(
vitstr_small(pretrained=True).eval()
)
crop_orientation_model = torch.compile(
mobilenet_v3_small_crop_orientation(pretrained=True).eval()
)
page_orientation_model = torch.compile(
mobilenet_v3_small_page_orientation(pretrained=True).eval()
)

predictor = ocr_predictor(
detection_model, recognition_model, assume_straight_pages=False
)
# NOTE: Only required for non-straight pages (`assume_straight_pages=False`) and when orientation classification is not disabled
# Set the orientation predictors
predictor.crop_orientation_predictor = crop_orientation_predictor(crop_orientation_model)
predictor.page_orientation_predictor = page_orientation_predictor(page_orientation_model)

compiled_out = predictor(doc)

Export to ONNX
^^^^^^^^^^^^^^
@@ -64,7 +123,11 @@ It defines a common format for representing models, including the network struct
input_shape = (3, 32, 128)
model = vitstr_small(pretrained=True, exportable=True)
dummy_input = torch.rand((batch_size, input_shape), dtype=torch.float32)
model_path = export_model_to_onnx(model, model_name="vitstr.onnx, dummy_input=dummy_input)
model_path = export_model_to_onnx(
model,
model_name="vitstr.onnx",
dummy_input=dummy_input
)

.. tab:: TensorFlow

@@ -78,7 +141,11 @@ It defines a common format for representing models, including the network struct
input_shape = (32, 128, 3)
model = vitstr_small(pretrained=True, exportable=True)
dummy_input = [tf.TensorSpec([batch_size, input_shape], tf.float32, name="input")]
model_path, output = export_model_to_onnx(model, model_name="vitstr.onnx", dummy_input=dummy_input)
model_path, output = export_model_to_onnx(
model,
model_name="vitstr.onnx",
dummy_input=dummy_input
)


Using your ONNX exported model
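The body of this section is collapsed in the diff view above. For context, here is a minimal sketch of running the exported recognition model with `onnxruntime` (not part of this PR); the file name and input shape simply follow the PyTorch export example above, and the CPU provider is an assumption:

.. code:: python3

    import numpy as np
    import onnxruntime

    # Load the exported graph (CPU provider assumed)
    session = onnxruntime.InferenceSession("vitstr.onnx", providers=["CPUExecutionProvider"])

    # Dummy batch matching the PyTorch export shape above (N, C, H, W)
    dummy_input = np.random.rand(1, 3, 32, 128).astype(np.float32)

    # Look up the graph's input name instead of hard-coding it
    input_name = session.get_inputs()[0].name
    logits = session.run(None, {input_name: dummy_input})[0]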
12 changes: 6 additions & 6 deletions docs/source/using_doctr/using_models.rst
@@ -298,7 +298,7 @@ For instance, this snippet instantiates an end-to-end ocr_predictor working with

.. code:: python3

from doctr.model import ocr_predictor
from doctr.models import ocr_predictor
model = ocr_predictor('linknet_resnet18', pretrained=True, assume_straight_pages=False, preserve_aspect_ratio=True)


@@ -309,7 +309,7 @@ Additionally, you can change the batch size of the underlying detection and reco

.. code:: python3

from doctr.model import ocr_predictor
from doctr.models import ocr_predictor
model = ocr_predictor(pretrained=True, det_bs=4, reco_bs=1024)

To modify the output structure you can pass the following arguments to the predictor which will be handled by the underlying `DocumentBuilder`:
@@ -322,7 +322,7 @@ For example to disable the automatic grouping of lines into blocks:

.. code:: python3

from doctr.model import ocr_predictor
from doctr.models import ocr_predictor
model = ocr_predictor(pretrained=True, resolve_blocks=False)


@@ -477,7 +477,7 @@ This will only have an effect with `assume_straight_pages=False` and/or `straigh

.. code:: python3

from doctr.model import ocr_predictor
from doctr.models import ocr_predictor
model = ocr_predictor(pretrained=True, assume_straight_pages=False, disable_page_orientation=True)


@@ -489,15 +489,15 @@ This will only have an effect with `assume_straight_pages=False` and/or `straigh

.. code:: python3

from doctr.model import ocr_predictor
from doctr.models import ocr_predictor
model = ocr_predictor(pretrained=True, assume_straight_pages=False, disable_crop_orientation=True)


* Add a hook to the `ocr_predictor` to manipulate the location predictions before the crops are passed to the recognition model.

.. code:: python3

from doctr.model import ocr_predictor
from doctr.models import ocr_predictor

class CustomHook:
def __call__(self, loc_preds):
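The rest of this example is collapsed in the diff view. A minimal sketch of what the completed hook could look like, assuming the predictor exposes an `add_hook` method as described in the doctr documentation; the hook below simply returns the relative location predictions unchanged:

.. code:: python3

    from doctr.models import ocr_predictor

    class CustomHook:
        def __call__(self, loc_preds):
            # Manipulate the location predictions here:
            # the output structure must match the input one, and the
            # coordinates are relative (between 0 and 1)
            return loc_preds

    predictor = ocr_predictor(pretrained=True)
    # Attach the hook so it runs before the crops are passed to the recognition model
    predictor.add_hook(CustomHook())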
19 changes: 15 additions & 4 deletions doctr/datasets/vocabs.py
@@ -19,9 +19,13 @@
"arabic_digits": "٠١٢٣٤٥٦٧٨٩",
"arabic_diacritics": "ًٌٍَُِّْ",
"arabic_punctuation": "؟؛«»—",
"hindi_letters": "अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह",
"hindi_letters": "अआइईउऊऋॠऌॡएऐओऔंःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह",
"hindi_digits": "०१२३४५६७८९",
"hindi_punctuation": "।,?!:्ॐ॰॥॰",
"hindi_punctuation": "।,?!:्ॐ॰॥",
"gujarati_vowels": "અઆઇઈઉઊઋએઐઓઔઅંઅઃ",
"gujarati_consonants":"કખગઘચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળક્ષજ્ઞ",
"gujarati_digits":"૦૧૨૩૪૫૬૭૮૯",
"gujarati_punctuation": "૰ઽ◌ંઃ॥ૐ઼ઁ" + "૱",
"bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ",
"bangla_digits": "০১২৩৪৫৬৭৮৯",
"generic_cyrillic_letters": "абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ",
@@ -53,11 +57,18 @@
VOCABS["swedish"] = VOCABS["english"] + "åäöÅÄÖ"
VOCABS["vietnamese"] = (
VOCABS["english"]
+ "áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵ"
+ "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ"
+ "áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựíìỉĩịýỳỷỹỵ"
+ "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰÍÌỈĨỊÝỲỶỸỴ"
)
VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪"
VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"]
VOCABS["gujarati"] = (
VOCABS["gujarati_consonants"]
+ VOCABS["gujarati_vowels"]
+ VOCABS["gujarati_digits"]
+ VOCABS["gujarati_punctuation"]
+ VOCABS["punctuation"]
)
VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"]
VOCABS["ukrainian"] = (
VOCABS["generic_cyrillic_letters"] + VOCABS["digits"] + VOCABS["punctuation"] + VOCABS["currency"] + "ґіїєҐІЇЄ₴"
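For context, the new `gujarati` entry is assembled from the consonant, vowel, digit and punctuation sub-vocabs above, plus the generic punctuation set. A minimal usage sketch (not part of this PR), assuming the recognition model factories accept a `vocab` keyword as in doctr's training scripts; the chosen architecture is only an example and its weights would still need to be trained on Gujarati data:

.. code:: python3

    from doctr.datasets import VOCABS
    from doctr.models import crnn_vgg16_bn, recognition_predictor

    # The docs table above lists 107 characters for this vocab
    gujarati_vocab = VOCABS["gujarati"]
    print(len(gujarati_vocab))

    # Build an untrained recognition model on the Gujarati vocab
    reco_model = crnn_vgg16_bn(pretrained=False, vocab=gujarati_vocab)
    predictor = recognition_predictor(arch=reco_model)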
11 changes: 9 additions & 2 deletions doctr/models/classification/zoo.py
@@ -5,7 +5,7 @@

from typing import Any

from doctr.file_utils import is_tf_available
from doctr.file_utils import is_tf_available, is_torch_available

from .. import classification
from ..preprocessor import PreProcessor
@@ -48,7 +48,14 @@ def _orientation_predictor(
# Load directly classifier from backbone
_model = classification.__dict__[arch](pretrained=pretrained)
else:
if not isinstance(arch, classification.MobileNetV3):
allowed_archs = [classification.MobileNetV3]
if is_torch_available():
# Adding the type for torch compiled models to the allowed architectures
from doctr.models.utils import _CompiledModule

allowed_archs.append(_CompiledModule)

if not isinstance(arch, tuple(allowed_archs)):
raise ValueError(f"unknown architecture: {type(arch)}")
_model = arch

13 changes: 9 additions & 4 deletions doctr/models/detection/differentiable_binarization/pytorch.py
@@ -205,11 +205,16 @@ def forward(
out["out_map"] = prob_map

if target is None or return_preds:
# Disable for torch.compile compatibility
@torch.compiler.disable # type: ignore[attr-defined]
def _postprocess(prob_map: torch.Tensor) -> list[dict[str, Any]]:
return [
dict(zip(self.class_names, preds))
for preds in self.postprocessor(prob_map.detach().cpu().permute((0, 2, 3, 1)).numpy())
]

# Post-process boxes (keep only text predictions)
out["preds"] = [
dict(zip(self.class_names, preds))
for preds in self.postprocessor(prob_map.detach().cpu().permute((0, 2, 3, 1)).numpy())
]
out["preds"] = _postprocess(prob_map)

if target is not None:
thresh_map = self.thresh_head(feat_concat)
13 changes: 9 additions & 4 deletions doctr/models/detection/fast/pytorch.py
@@ -196,11 +196,16 @@ def forward(
out["out_map"] = prob_map

if target is None or return_preds:
# Disable for torch.compile compatibility
@torch.compiler.disable # type: ignore[attr-defined]
def _postprocess(prob_map: torch.Tensor) -> list[dict[str, Any]]:
return [
dict(zip(self.class_names, preds))
for preds in self.postprocessor(prob_map.detach().cpu().permute((0, 2, 3, 1)).numpy())
]

# Post-process boxes (keep only text predictions)
out["preds"] = [
dict(zip(self.class_names, preds))
for preds in self.postprocessor(prob_map.detach().cpu().permute((0, 2, 3, 1)).numpy())
]
out["preds"] = _postprocess(prob_map)

if target is not None:
loss = self.compute_loss(logits, target)
15 changes: 10 additions & 5 deletions doctr/models/detection/linknet/pytorch.py
@@ -183,11 +183,16 @@ def forward(
out["out_map"] = prob_map

if target is None or return_preds:
# Post-process boxes
out["preds"] = [
dict(zip(self.class_names, preds))
for preds in self.postprocessor(prob_map.detach().cpu().permute((0, 2, 3, 1)).numpy())
]
# Disable for torch.compile compatibility
@torch.compiler.disable # type: ignore[attr-defined]
def _postprocess(prob_map: torch.Tensor) -> list[dict[str, Any]]:
return [
dict(zip(self.class_names, preds))
for preds in self.postprocessor(prob_map.detach().cpu().permute((0, 2, 3, 1)).numpy())
]

# Post-process boxes (keep only text predictions)
out["preds"] = _postprocess(prob_map)

if target is not None:
loss = self.compute_loss(logits, target)
9 changes: 8 additions & 1 deletion doctr/models/detection/zoo.py
@@ -56,7 +56,14 @@ def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True,
if isinstance(_model, detection.FAST):
_model = reparameterize(_model)
else:
if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)):
allowed_archs = [detection.DBNet, detection.LinkNet, detection.FAST]
if is_torch_available():
# Adding the type for torch compiled models to the allowed architectures
from doctr.models.utils import _CompiledModule

allowed_archs.append(_CompiledModule)

if not isinstance(arch, tuple(allowed_archs)):
raise ValueError(f"unknown architecture: {type(arch)}")

_model = arch
7 changes: 6 additions & 1 deletion doctr/models/recognition/crnn/pytorch.py
@@ -213,8 +213,13 @@ def forward(
out["out_map"] = logits

if target is None or return_preds:
# Disable for torch.compile compatibility
@torch.compiler.disable # type: ignore[attr-defined]
def _postprocess(logits: torch.Tensor) -> list[tuple[str, float]]:
return self.postprocessor(logits)

# Post-process boxes
out["preds"] = self.postprocessor(logits)
out["preds"] = _postprocess(logits)

if target is not None:
out["loss"] = self.compute_loss(logits, target)
8 changes: 7 additions & 1 deletion doctr/models/recognition/master/pytorch.py
@@ -209,7 +209,13 @@ def forward(
out["out_map"] = logits

if return_preds:
out["preds"] = self.postprocessor(logits)
# Disable for torch.compile compatibility
@torch.compiler.disable # type: ignore[attr-defined]
def _postprocess(logits: torch.Tensor) -> list[tuple[str, float]]:
return self.postprocessor(logits)

# Post-process boxes
out["preds"] = _postprocess(logits)

return out

7 changes: 6 additions & 1 deletion doctr/models/recognition/parseq/pytorch.py
@@ -372,8 +372,13 @@ def forward(
out["out_map"] = logits

if target is None or return_preds:
# Disable for torch.compile compatibility
@torch.compiler.disable # type: ignore[attr-defined]
def _postprocess(logits: torch.Tensor) -> list[tuple[str, float]]:
return self.postprocessor(logits)

# Post-process boxes
out["preds"] = self.postprocessor(logits)
out["preds"] = _postprocess(logits)

if target is not None:
out["loss"] = loss