From c16f86094ea58abefb9bf8b58725e554b2118e8d Mon Sep 17 00:00:00 2001
From: Bhavya Patel <bhavya.patel@injala.com>
Date: Fri, 15 Nov 2024 15:19:56 +0530
Subject: [PATCH 1/4] adding pytorch DDP  script of detection task

---
 references/detection/train_pytorch_DDP.py | 546 ++++++++++++++++++++++
 1 file changed, 546 insertions(+)
 create mode 100644 references/detection/train_pytorch_DDP.py

diff --git a/references/detection/train_pytorch_DDP.py b/references/detection/train_pytorch_DDP.py
new file mode 100644
index 0000000000..1da370d9e1
--- /dev/null
+++ b/references/detection/train_pytorch_DDP.py
@@ -0,0 +1,546 @@
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+import os
+
+os.environ["USE_TORCH"] = "1"
+
+import datetime
+import hashlib
+import logging
+import multiprocessing 
+import time
+
+import numpy as np
+import torch
+import wandb
+from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR, PolynomialLR
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
+from torchvision.transforms.v2 import Compose, GaussianBlur, Normalize, RandomGrayscale, RandomPhotometricDistort
+from tqdm.auto import tqdm
+# using this for DDP setup
+import torch.distributed as dist 
+import torch.multiprocessing as mp
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.data.distributed import DistributedSampler
+
+from doctr import transforms as T
+from doctr.datasets import DetectionDataset
+from doctr.models import detection, login_to_hub, push_to_hf_hub
+from doctr.utils.metrics import LocalizationConfusion
+from utils import EarlyStopper, plot_recorder, plot_samples
+
+
+def record_lr(
+    model: torch.nn.Module,
+    train_loader: DataLoader,
+    batch_transforms,
+    optimizer,
+    start_lr: float = 1e-7,
+    end_lr: float = 1,
+    num_it: int = 100,
+    amp: bool = False,
+):
+    """Gridsearch the optimal learning rate for the training.
+    Adapted from https://github.com/frgfm/Holocron/blob/master/holocron/trainer/core.py
+    """
+    if num_it > len(train_loader):
+        raise ValueError("the value of `num_it` needs to be lower than the number of available batches")
+
+    model = model.train()
+    # Update param groups & LR
+    optimizer.defaults["lr"] = start_lr
+    for pgroup in optimizer.param_groups:
+        pgroup["lr"] = start_lr
+
+    gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
+    scheduler = MultiplicativeLR(optimizer, lambda step: gamma)
+
+    lr_recorder = [start_lr * gamma**idx for idx in range(num_it)]
+    loss_recorder = []
+
+    if amp:
+        scaler = torch.cuda.amp.GradScaler()
+
+    for batch_idx, (images, targets) in enumerate(train_loader):
+        if torch.cuda.is_available():
+            images = images.cuda()
+
+        images = batch_transforms(images)
+
+        # Forward, Backward & update
+        optimizer.zero_grad()
+        if amp:
+            with torch.cuda.amp.autocast():
+                train_loss = model(images, targets)["loss"]
+            scaler.scale(train_loss).backward()
+            # Gradient clipping
+            scaler.unscale_(optimizer)
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+            # Update the params
+            scaler.step(optimizer)
+            scaler.update()
+        else:
+            train_loss = model(images, targets)["loss"]
+            train_loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+            optimizer.step()
+        # Update LR
+        scheduler.step()
+
+        # Record
+        if not torch.isfinite(train_loss):
+            if batch_idx == 0:
+                raise ValueError("loss value is NaN or inf.")
+            else:
+                break
+        loss_recorder.append(train_loss.item())
+        # Stop after the number of iterations
+        if batch_idx + 1 == num_it:
+            break
+
+    return lr_recorder[: len(loss_recorder)], loss_recorder
+
+
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False):
+    if amp:
+        scaler = torch.cuda.amp.GradScaler()
+
+    model.train()
+    # Iterate over the batches of the dataset
+    pbar = tqdm(train_loader, position=1)
+    for images, targets in pbar:
+        if torch.cuda.is_available():
+            images = images.cuda()
+        images = batch_transforms(images)
+
+        optimizer.zero_grad()
+        if amp:
+            with torch.cuda.amp.autocast():
+                train_loss = model(images, targets)["loss"]
+            scaler.scale(train_loss).backward()
+            # Gradient clipping
+            scaler.unscale_(optimizer)
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+            # Update the params
+            scaler.step(optimizer)
+            scaler.update()
+        else:
+            train_loss = model(images, targets)["loss"]
+            train_loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+            optimizer.step()
+
+        scheduler.step()
+
+        pbar.set_description(f"Training loss: {train_loss.item():.6}")
+
+
+@torch.no_grad()
+def evaluate(model, val_loader, batch_transforms, val_metric,args, amp=False):
+    # Model in eval mode
+    model.eval()
+    # Reset val metric
+    val_metric.reset()
+    # Validation loop
+    val_loss, batch_cnt = 0, 0
+    for images, targets in tqdm(val_loader):
+        if torch.cuda.is_available():
+            images = images.cuda()
+        images = batch_transforms(images)
+        if amp:
+            with torch.cuda.amp.autocast():
+                out = model(images, targets, return_preds=True)
+        else:
+            out = model(images, targets, return_preds=True)
+        # Compute metric
+        loc_preds = out["preds"]
+        for target, loc_pred in zip(targets, loc_preds):
+            for boxes_gt, boxes_pred in zip(target.values(), loc_pred.values()):
+                if args.rotation and args.eval_straight:
+                    # Convert pred to boxes [xmin, ymin, xmax, ymax]  N, 5, 2 (with scores) --> N, 4
+                    boxes_pred = np.concatenate((boxes_pred[:, :4].min(axis=1), boxes_pred[:, :4].max(axis=1)), axis=-1)
+                val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4])
+
+        val_loss += out["loss"].item()
+        batch_cnt += 1
+
+    val_loss /= batch_cnt
+    recall, precision, mean_iou = val_metric.summary()
+    return val_loss, recall, precision, mean_iou
+
+
+def main(rank:int, world_size:int, args):
+    
+    """
+        rank(int) : it is the unique identifier to each process and you can also say that it is your device id 
+        world_size(int) : total number of processes 
+    """
+    
+    print(args)
+
+    if rank == 0 and args.push_to_hub:
+        login_to_hub()
+
+    if not isinstance(args.workers, int):
+        args.workers = min(16, mp.cpu_count())
+
+    torch.backends.cudnn.benchmark = True
+
+    if rank == 0:
+        # validation dataset realted code
+        st = time.time()
+        val_set = DetectionDataset(
+            img_folder=os.path.join(args.val_path, "images"),
+            label_path=os.path.join(args.val_path, "labels.json"),
+            sample_transforms=T.SampleCompose(
+                (
+                    [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)]
+                    if not args.rotation or args.eval_straight
+                    else []
+                )
+                + (
+                    [
+                        T.Resize(args.input_size, preserve_aspect_ratio=True),  # This does not pad
+                        T.RandomApply(T.RandomRotate(90, expand=True), 0.5),
+                        T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True),
+                    ]
+                    if args.rotation and not args.eval_straight
+                    else []
+                )
+            ),
+            use_polygons=args.rotation and not args.eval_straight,
+        )
+        val_loader = DataLoader(
+            val_set,
+            batch_size=args.batch_size,
+            drop_last=False,
+            num_workers=args.workers,
+            sampler=SequentialSampler(val_set),
+            # sampler = DistributedSampler(val_set, num_replicas = world_size, rank = rank, shuffle = False, drop_last = True),
+            pin_memory=torch.cuda.is_available(),
+            collate_fn=val_set.collate_fn,
+        )
+        print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)")
+        with open(os.path.join(args.val_path, "labels.json"), "rb") as f:
+            val_hash = hashlib.sha256(f.read()).hexdigest()
+        
+        class_names = val_set.class_names
+    else:
+        class_names = None
+
+    batch_transforms = Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287))
+
+    # loadign the doctr model
+    model = detection.__dict__[args.arch](
+        pretrained=args.pretrained,
+        assume_straight_pages=not args.rotation,
+        class_names=class_names,
+    )
+
+    # Resume weights
+    if isinstance(args.resume, str):
+        print(f"Resuming {args.resume}")
+        checkpoint = torch.load(args.resume, map_location="cpu")
+        model.load_state_dict(checkpoint)
+
+    # # GPU
+    # if isinstance(args.device, int):
+    #     if not torch.cuda.is_available():
+    #         raise AssertionError("PyTorch cannot access your GPU. Please investigate!")
+    #     if args.device >= torch.cuda.device_count():
+    #         raise ValueError("Invalid device index")
+    # # Silent default switch to GPU if available
+    # elif torch.cuda.is_available():
+    #     args.device = 0
+    # else:
+    #     logging.warning("No accessible GPU, target device set to CPU.")
+    # if torch.cuda.is_available():
+    #     torch.cuda.set_device(args.device)
+    #     model = model.cuda()
+    
+    # creating the process group
+    
+    device = torch.device('cuda', args.devices[rank])
+    dist.init_process_group(args.backend, rank = rank, world_size = world_size)
+    print('1. init_process_group initialized')
+    
+    model = model.to(device)
+    
+    # wrapping the model around DDP 
+    model = DDP(model, device_ids = [device])
+    print('2. wrapped the model around DDP')
+
+    if rank == 0:
+        # Metrics
+        val_metric = LocalizationConfusion(use_polygons=args.rotation and not args.eval_straight)
+
+    if rank == 0 and args.test_only:
+        print("Running evaluation")
+        val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric,args, amp=args.amp)
+        print(
+            f"Validation loss: {val_loss:.6} (Recall: {recall:.2%} | Precision: {precision:.2%} | "
+            f"Mean IoU: {mean_iou:.2%})"
+        )
+        return
+
+    st = time.time()
+    # Augmentations
+    # Image augmentations
+    img_transforms = T.OneOf([
+        Compose([
+            T.RandomApply(T.ColorInversion(), 0.3),
+            T.RandomApply(GaussianBlur(kernel_size=5, sigma=(0.1, 4)), 0.2),
+        ]),
+        Compose([
+            T.RandomApply(T.RandomShadow(), 0.3),
+            T.RandomApply(T.GaussianNoise(), 0.1),
+            T.RandomApply(GaussianBlur(kernel_size=5, sigma=(0.1, 4)), 0.3),
+            RandomGrayscale(p=0.15),
+        ]),
+        RandomPhotometricDistort(p=0.3),
+        lambda x: x,  # Identity no transformation
+    ])
+    
+    print('3. completed img_transforms')
+    # Image + target augmentations
+    sample_transforms = T.SampleCompose(
+        (
+            [
+                T.RandomHorizontalFlip(0.15),
+                T.OneOf([
+                    T.RandomApply(T.RandomCrop(ratio=(0.6, 1.33)), 0.25),
+                    T.RandomResize(scale_range=(0.4, 0.9), preserve_aspect_ratio=0.5, symmetric_pad=0.5, p=0.25),
+                ]),
+                T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True),
+            ]
+            if not args.rotation
+            else [
+                T.RandomHorizontalFlip(0.15),
+                T.OneOf([
+                    T.RandomApply(T.RandomCrop(ratio=(0.6, 1.33)), 0.25),
+                    T.RandomResize(scale_range=(0.4, 0.9), preserve_aspect_ratio=0.5, symmetric_pad=0.5, p=0.25),
+                ]),
+                # Rotation augmentation
+                T.Resize(args.input_size, preserve_aspect_ratio=True),
+                T.RandomApply(T.RandomRotate(90, expand=True), 0.5),
+                T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True),
+            ]
+        )
+    )
+    
+    print('4. completed sample transform')
+
+    # Load both train and val data generators
+    train_set = DetectionDataset(
+        img_folder=os.path.join(args.train_path, "images"),
+        label_path=os.path.join(args.train_path, "labels.json"),
+        img_transforms=img_transforms,
+        sample_transforms=sample_transforms,
+        use_polygons=args.rotation,
+    )
+
+    train_loader = DataLoader(
+        train_set,
+        batch_size=args.batch_size,
+        drop_last=True,
+        num_workers=args.workers,
+        # sampler=(train_set),
+        sampler = DistributedSampler(train_set, num_replicas = world_size, rank = rank, shuffle = False, drop_last = True),
+        pin_memory=torch.cuda.is_available(),
+        collate_fn=train_set.collate_fn,
+    )
+    print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches) along with DistributedSampler")
+    
+    with open(os.path.join(args.train_path, "labels.json"), "rb") as f:
+        train_hash = hashlib.sha256(f.read()).hexdigest()
+
+    if args.show_samples:
+        x, target = next(iter(train_loader))
+        plot_samples(x, target)
+        # return
+
+    # Backbone freezing
+    if args.freeze_backbone:
+        for p in model.feat_extractor.parameters():
+            p.requires_grad = False
+
+    # Optimizer
+    optimizer = torch.optim.Adam(
+        [p for p in model.parameters() if p.requires_grad],
+        args.lr,
+        betas=(0.95, 0.99),
+        eps=1e-6,
+        weight_decay=args.weight_decay,
+    )
+    # LR Finder
+    if args.find_lr:
+        lrs, losses = record_lr(model, train_loader, batch_transforms, optimizer, amp=args.amp)
+        plot_recorder(lrs, losses)
+        return
+    # Scheduler
+    if args.sched == "cosine":
+        scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4)
+    elif args.sched == "onecycle":
+        scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader))
+    elif args.sched == "poly":
+        scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader))
+
+    # Training monitoring
+    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+    exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name
+
+    # W&B
+    if rank==0 and args.wb:
+        run = wandb.init(
+            name=exp_name,
+            project="text-detection",
+            config={
+                "learning_rate": args.lr,
+                "epochs": args.epochs,
+                "weight_decay": args.weight_decay,
+                "batch_size": args.batch_size,
+                "architecture": args.arch,
+                "input_size": args.input_size,
+                "optimizer": "adam",
+                "framework": "pytorch",
+                "scheduler": args.sched,
+                "train_hash": train_hash,
+                "val_hash": val_hash,
+                "pretrained": args.pretrained,
+                "rotation": args.rotation,
+                "amp": args.amp,
+            },
+        )
+
+    # Create loss queue
+    min_loss = np.inf
+    if args.early_stop:
+        early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
+
+    print('5. going inside training loop')
+    
+    # Training loop
+    for epoch in range(args.epochs):
+        fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp)
+        
+        if rank == 0:
+            # Validation loop at the end of each epoch
+            val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric,args, amp=args.amp)
+            if val_loss < min_loss:
+                print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
+                torch.save(model.module.state_dict(), f"./{exp_name}.pt")
+                min_loss = val_loss
+            if args.save_interval_epoch:
+                print(f"Saving state at epoch: {epoch + 1}")
+                torch.save(model.state_dict(), f"./{exp_name}_epoch{epoch + 1}.pt")
+            log_msg = f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} "
+            if any(val is None for val in (recall, precision, mean_iou)):
+                log_msg += "(Undefined metric value, caused by empty GTs or predictions)"
+            else:
+                log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})"
+            print(log_msg)
+            # W&B
+            if args.wb:
+                wandb.log({
+                    "val_loss": val_loss,
+                    "recall": recall,
+                    "precision": precision,
+                    "mean_iou": mean_iou,
+                })
+            if args.early_stop and early_stopper.early_stop(val_loss):
+                print("Training halted early due to reaching patience limit.")
+                break
+    print('6. coming out of training loop')       
+    if rank == 0:
+        if args.wb:
+            run.finish()
+
+        if args.push_to_hub:
+            push_to_hf_hub(model, exp_name, task="detection", run_config=args)
+    print('7. initializing destroy_process_group ')    
+    dist.destroy_process_group()
+    print('8. done initializing of destroy_process_group')
+
+
+def parse_args():
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="DocTR training script for text detection (PyTorch)",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # args realted to DDP
+    parser.add_argument('--backend', default='nccl', type=str, help='backend to use for torch DDP specifically it is for nvidia GPUs')
+
+    parser.add_argument("arch", type=str, help="text-detection model to train")
+    parser.add_argument("--train_path", type=str, required=True, help="path to training data folder")
+    parser.add_argument("--val_path", type=str, required=True, help="path to validation data folder")
+    parser.add_argument("--name", type=str, default=None, help="Name of your training experiment")
+    parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")
+    parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for training")
+    parser.add_argument("--devices", default=None, nargs='+',type=int, help="GPU devices to use for training")
+    parser.add_argument(
+        "--save-interval-epoch", dest="save_interval_epoch", action="store_true", help="Save model every epoch"
+    )
+    parser.add_argument("--input_size", type=int, default=1024, help="model input size, H = W")
+    parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam)")
+    parser.add_argument("--wd", "--weight-decay", default=0, type=float, help="weight decay", dest="weight_decay")
+    parser.add_argument("-j", "--workers", type=int, default=0, help="number of workers used for dataloading")
+    parser.add_argument("--resume", type=str, default=None, help="Path to your checkpoint")
+    parser.add_argument("--test-only", dest="test_only", action="store_true", help="Run the validation loop")
+    parser.add_argument(
+        "--freeze-backbone", dest="freeze_backbone", action="store_true", help="freeze model backbone for fine-tuning"
+    )
+    parser.add_argument(
+        "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples"
+    )
+    parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
+    parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub")
+    parser.add_argument(
+        "--pretrained",
+        dest="pretrained",
+        action="store_true",
+        help="Load pretrained parameters before starting the training",
+    )
+    parser.add_argument("--rotation", dest="rotation", action="store_true", help="train with rotated documents")
+    parser.add_argument(
+        "--eval-straight",
+        action="store_true",
+        help="metrics evaluation with straight boxes instead of polygons to save time + memory",
+    )
+    parser.add_argument(
+        "--sched", type=str, default="poly", choices=["cosine", "onecycle", "poly"], help="scheduler to use"
+    )
+    parser.add_argument("--amp", dest="amp", help="Use Automatic Mixed Precision", action="store_true")
+    parser.add_argument("--find-lr", action="store_true", help="Gridsearch the optimal LR")
+    parser.add_argument("--early-stop", action="store_true", help="Enable early stopping")
+    parser.add_argument("--early-stop-epochs", type=int, default=5, help="Patience for early stopping")
+    parser.add_argument("--early-stop-delta", type=float, default=0.01, help="Minimum Delta for early stopping")
+    args = parser.parse_args()
+
+    return args
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    
+    if not torch.cuda.is_available():
+        raise AssertionError('PyTorch cannot access your GPUs. please look into it bro !!!')
+    
+    if not isinstance(args.devices, list):
+        args.devices = list(range(torch.cuda.device_count()))
+        
+    # no of process per gpu
+    nprocs = len(args.devices)
+    
+    # setting up environ variables for DDP
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = '29500'
+    
+    mp.spawn(main, args=(nprocs, args), nprocs = nprocs, join=True)
+    
+  
\ No newline at end of file

From 8046d9f15598013234b9a5ecbfcc7a9679d1da6d Mon Sep 17 00:00:00 2001
From: Bhavya Patel <bhavya.patel@injala.com>
Date: Tue, 19 Nov 2024 13:17:39 +0530
Subject: [PATCH 2/4] added DDP script for detection task and updated its 
 corresponding README file.

---
 references/detection/README.md            | 21 +++++++
 references/detection/train_pytorch_DDP.py | 68 ++++++++---------------
 2 files changed, 43 insertions(+), 46 deletions(-)

diff --git a/references/detection/README.md b/references/detection/README.md
index 35d1481877..d2bddf6b73 100644
--- a/references/detection/README.md
+++ b/references/detection/README.md
@@ -24,6 +24,27 @@ or PyTorch:
 ```shell
 python references/detection/train_pytorch.py db_resnet50 --train_path path/to/your/train_set --val_path path/to/your/val_set --epochs 5
 ```
+### Multi-GPU support (PyTorch only)
+
+Multi-GPU support on Detection task with PyTorch has been added. 
+Arguments are the same than the ones from single GPU, except:
+
+- `--devices`: **by default, if you do not pass `--devices`, it will use all GPUs on your computer**.
+You can use specific GPUs by passing a list of ids (ex: `0 1 2`). To find them, you can use the following snippet:
+
+```python
+import torch
+devices = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]
+device_names = [torch.cuda.get_device_name(d) for d in devices]
+```
+
+- `--backend`: you can specify another `backend` for `DistribuedDataParallel` if the default one is not available on
+your operating system. Fastest one is `nccl` according to [PyTorch Documentation](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html).
+
+
+```shell
+python references/detection/train_pytorch_ddp.py db_resnet50 --train_path path/to/your/train_set --val_path path/to/your/val_set --epochs 5 --devices 0 1 --backend nccl
+```
 
 ## Data format
 
diff --git a/references/detection/train_pytorch_DDP.py b/references/detection/train_pytorch_DDP.py
index 1da370d9e1..3675a73eee 100644
--- a/references/detection/train_pytorch_DDP.py
+++ b/references/detection/train_pytorch_DDP.py
@@ -9,10 +9,7 @@
 
 import datetime
 import hashlib
-import logging
-import multiprocessing 
 import time
-
 import numpy as np
 import torch
 import wandb
@@ -20,7 +17,8 @@
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torchvision.transforms.v2 import Compose, GaussianBlur, Normalize, RandomGrayscale, RandomPhotometricDistort
 from tqdm.auto import tqdm
-# using this for DDP setup
+
+# The following import is required for DDP
 import torch.distributed as dist 
 import torch.multiprocessing as mp
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -175,9 +173,13 @@ def evaluate(model, val_loader, batch_transforms, val_metric,args, amp=False):
 def main(rank:int, world_size:int, args):
     
     """
-        rank(int) : it is the unique identifier to each process and you can also say that it is your device id 
-        world_size(int) : total number of processes 
+        Args:
+        ----
+        rank (int): device id to put the model on
+        world_size (int): number of processes participating in the job
+        args: other arguments passed through the CLI
     """
+
     
     print(args)
 
@@ -190,7 +192,7 @@ def main(rank:int, world_size:int, args):
     torch.backends.cudnn.benchmark = True
 
     if rank == 0:
-        # validation dataset realted code
+        # validation dataset related code
         st = time.time()
         val_set = DetectionDataset(
             img_folder=os.path.join(args.val_path, "images"),
@@ -219,7 +221,6 @@ def main(rank:int, world_size:int, args):
             drop_last=False,
             num_workers=args.workers,
             sampler=SequentialSampler(val_set),
-            # sampler = DistributedSampler(val_set, num_replicas = world_size, rank = rank, shuffle = False, drop_last = True),
             pin_memory=torch.cuda.is_available(),
             collate_fn=val_set.collate_fn,
         )
@@ -233,7 +234,7 @@ def main(rank:int, world_size:int, args):
 
     batch_transforms = Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287))
 
-    # loadign the doctr model
+    # Load DocTr model
     model = detection.__dict__[args.arch](
         pretrained=args.pretrained,
         assume_straight_pages=not args.rotation,
@@ -246,32 +247,14 @@ def main(rank:int, world_size:int, args):
         checkpoint = torch.load(args.resume, map_location="cpu")
         model.load_state_dict(checkpoint)
 
-    # # GPU
-    # if isinstance(args.device, int):
-    #     if not torch.cuda.is_available():
-    #         raise AssertionError("PyTorch cannot access your GPU. Please investigate!")
-    #     if args.device >= torch.cuda.device_count():
-    #         raise ValueError("Invalid device index")
-    # # Silent default switch to GPU if available
-    # elif torch.cuda.is_available():
-    #     args.device = 0
-    # else:
-    #     logging.warning("No accessible GPU, target device set to CPU.")
-    # if torch.cuda.is_available():
-    #     torch.cuda.set_device(args.device)
-    #     model = model.cuda()
-    
-    # creating the process group
-    
+    # create default process group
     device = torch.device('cuda', args.devices[rank])
     dist.init_process_group(args.backend, rank = rank, world_size = world_size)
-    print('1. init_process_group initialized')
-    
+    # create local model
     model = model.to(device)
-    
-    # wrapping the model around DDP 
+    # construct the DDP model  
     model = DDP(model, device_ids = [device])
-    print('2. wrapped the model around DDP')
+    
 
     if rank == 0:
         # Metrics
@@ -304,7 +287,6 @@ def main(rank:int, world_size:int, args):
         lambda x: x,  # Identity no transformation
     ])
     
-    print('3. completed img_transforms')
     # Image + target augmentations
     sample_transforms = T.SampleCompose(
         (
@@ -331,7 +313,6 @@ def main(rank:int, world_size:int, args):
         )
     )
     
-    print('4. completed sample transform')
 
     # Load both train and val data generators
     train_set = DetectionDataset(
@@ -420,7 +401,6 @@ def main(rank:int, world_size:int, args):
     if args.early_stop:
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
 
-    print('5. going inside training loop')
     
     # Training loop
     for epoch in range(args.epochs):
@@ -453,16 +433,14 @@ def main(rank:int, world_size:int, args):
             if args.early_stop and early_stopper.early_stop(val_loss):
                 print("Training halted early due to reaching patience limit.")
                 break
-    print('6. coming out of training loop')       
+         
     if rank == 0:
         if args.wb:
             run.finish()
 
         if args.push_to_hub:
             push_to_hf_hub(model, exp_name, task="detection", run_config=args)
-    print('7. initializing destroy_process_group ')    
-    dist.destroy_process_group()
-    print('8. done initializing of destroy_process_group')
+                
 
 
 def parse_args():
@@ -473,8 +451,8 @@ def parse_args():
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
 
-    # args realted to DDP
-    parser.add_argument('--backend', default='nccl', type=str, help='backend to use for torch DDP specifically it is for nvidia GPUs')
+    # DDP related args
+    parser.add_argument('--backend', default='nccl', type=str, help='backend to use for torch DDP')
 
     parser.add_argument("arch", type=str, help="text-detection model to train")
     parser.add_argument("--train_path", type=str, required=True, help="path to training data folder")
@@ -527,20 +505,18 @@ def parse_args():
 
 if __name__ == "__main__":
     args = parse_args()
-    
     if not torch.cuda.is_available():
         raise AssertionError('PyTorch cannot access your GPUs. please look into it bro !!!')
     
     if not isinstance(args.devices, list):
-        args.devices = list(range(torch.cuda.device_count()))
-        
+        args.devices = list(range(torch.cuda.device_count()))  
     # no of process per gpu
     nprocs = len(args.devices)
-    
-    # setting up environ variables for DDP
+    # Environment variables which need to be
+    # set when using c10d's default "env"
+    # initialization mode.
     os.environ['MASTER_ADDR'] = 'localhost'
     os.environ['MASTER_PORT'] = '29500'
-    
     mp.spawn(main, args=(nprocs, args), nprocs = nprocs, join=True)
     
   
\ No newline at end of file

From a522f0ee4f7109787698a24443038048b651b757 Mon Sep 17 00:00:00 2001
From: Bhavya Patel <bhavya.patel@injala.com>
Date: Thu, 21 Nov 2024 22:43:03 +0530
Subject: [PATCH 3/4] updated script for DDP

---
 references/detection/train_pytorch_DDP.py | 83 +++++++++++------------
 1 file changed, 40 insertions(+), 43 deletions(-)

diff --git a/references/detection/train_pytorch_DDP.py b/references/detection/train_pytorch_DDP.py
index 3675a73eee..63df84885e 100644
--- a/references/detection/train_pytorch_DDP.py
+++ b/references/detection/train_pytorch_DDP.py
@@ -9,20 +9,21 @@
 
 import datetime
 import hashlib
+import multiprocessing
 import time
 import numpy as np
 import torch
-import wandb
-from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR, PolynomialLR
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
-from torchvision.transforms.v2 import Compose, GaussianBlur, Normalize, RandomGrayscale, RandomPhotometricDistort
-from tqdm.auto import tqdm
 
 # The following import is required for DDP
-import torch.distributed as dist 
+import torch.distributed as dist
 import torch.multiprocessing as mp
+import wandb
 from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR, PolynomialLR
+from torch.utils.data import DataLoader, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
+from torchvision.transforms.v2 import Compose, GaussianBlur, Normalize, RandomGrayscale, RandomPhotometricDistort
+from tqdm.auto import tqdm
 
 from doctr import transforms as T
 from doctr.datasets import DetectionDataset
@@ -137,7 +138,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
 
 
 @torch.no_grad()
-def evaluate(model, val_loader, batch_transforms, val_metric,args, amp=False):
+def evaluate(model, val_loader, batch_transforms, val_metric, args, amp=False):
     # Model in eval mode
     model.eval()
     # Reset val metric
@@ -170,24 +171,21 @@ def evaluate(model, val_loader, batch_transforms, val_metric,args, amp=False):
     return val_loss, recall, precision, mean_iou
 
 
-def main(rank:int, world_size:int, args):
-    
+def main(rank: int, world_size: int, args):
     """
-        Args:
-        ----
+    Args:
         rank (int): device id to put the model on
         world_size (int): number of processes participating in the job
         args: other arguments passed through the CLI
     """
 
-    
     print(args)
 
     if rank == 0 and args.push_to_hub:
         login_to_hub()
 
     if not isinstance(args.workers, int):
-        args.workers = min(16, mp.cpu_count())
+        args.workers = min(16, multiprocessing.cpu_count())
 
     torch.backends.cudnn.benchmark = True
 
@@ -227,7 +225,7 @@ def main(rank:int, world_size:int, args):
         print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)")
         with open(os.path.join(args.val_path, "labels.json"), "rb") as f:
             val_hash = hashlib.sha256(f.read()).hexdigest()
-        
+
         class_names = val_set.class_names
     else:
         class_names = None
@@ -248,13 +246,12 @@ def main(rank:int, world_size:int, args):
         model.load_state_dict(checkpoint)
 
     # create default process group
-    device = torch.device('cuda', args.devices[rank])
-    dist.init_process_group(args.backend, rank = rank, world_size = world_size)
+    device = torch.device("cuda", args.devices[rank])
+    dist.init_process_group(args.backend, rank=rank, world_size=world_size)
     # create local model
     model = model.to(device)
-    # construct the DDP model  
-    model = DDP(model, device_ids = [device])
-    
+    # construct the DDP model
+    model = DDP(model, device_ids=[device])
 
     if rank == 0:
         # Metrics
@@ -262,7 +259,9 @@ def main(rank:int, world_size:int, args):
 
     if rank == 0 and args.test_only:
         print("Running evaluation")
-        val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric,args, amp=args.amp)
+        val_loss, recall, precision, mean_iou = evaluate(
+            model, val_loader, batch_transforms, val_metric, args, amp=args.amp
+        )
         print(
             f"Validation loss: {val_loss:.6} (Recall: {recall:.2%} | Precision: {precision:.2%} | "
             f"Mean IoU: {mean_iou:.2%})"
@@ -286,7 +285,7 @@ def main(rank:int, world_size:int, args):
         RandomPhotometricDistort(p=0.3),
         lambda x: x,  # Identity no transformation
     ])
-    
+
     # Image + target augmentations
     sample_transforms = T.SampleCompose(
         (
@@ -312,7 +311,6 @@ def main(rank:int, world_size:int, args):
             ]
         )
     )
-    
 
     # Load both train and val data generators
     train_set = DetectionDataset(
@@ -328,17 +326,18 @@ def main(rank:int, world_size:int, args):
         batch_size=args.batch_size,
         drop_last=True,
         num_workers=args.workers,
-        # sampler=(train_set),
-        sampler = DistributedSampler(train_set, num_replicas = world_size, rank = rank, shuffle = False, drop_last = True),
+        sampler=DistributedSampler(train_set, num_replicas=world_size, rank=rank, shuffle=False, drop_last=True),
         pin_memory=torch.cuda.is_available(),
         collate_fn=train_set.collate_fn,
     )
-    print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches) along with DistributedSampler")
-    
+    print(
+        f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)"
+    )
+
     with open(os.path.join(args.train_path, "labels.json"), "rb") as f:
         train_hash = hashlib.sha256(f.read()).hexdigest()
 
-    if args.show_samples:
+    if rank == 0 and args.show_samples:
         x, target = next(iter(train_loader))
         plot_samples(x, target)
         # return
@@ -374,7 +373,7 @@ def main(rank:int, world_size:int, args):
     exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name
 
     # W&B
-    if rank==0 and args.wb:
+    if rank == 0 and args.wb:
         run = wandb.init(
             name=exp_name,
             project="text-detection",
@@ -401,14 +400,15 @@ def main(rank:int, world_size:int, args):
     if args.early_stop:
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
 
-    
     # Training loop
     for epoch in range(args.epochs):
         fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp)
-        
+
         if rank == 0:
             # Validation loop at the end of each epoch
-            val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric,args, amp=args.amp)
+            val_loss, recall, precision, mean_iou = evaluate(
+                model, val_loader, batch_transforms, val_metric, args, amp=args.amp
+            )
             if val_loss < min_loss:
                 print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
                 torch.save(model.module.state_dict(), f"./{exp_name}.pt")
@@ -433,14 +433,13 @@ def main(rank:int, world_size:int, args):
             if args.early_stop and early_stopper.early_stop(val_loss):
                 print("Training halted early due to reaching patience limit.")
                 break
-         
+
     if rank == 0:
         if args.wb:
             run.finish()
 
         if args.push_to_hub:
             push_to_hf_hub(model, exp_name, task="detection", run_config=args)
-                
 
 
 def parse_args():
@@ -452,7 +451,7 @@ def parse_args():
     )
 
     # DDP related args
-    parser.add_argument('--backend', default='nccl', type=str, help='backend to use for torch DDP')
+    parser.add_argument("--backend", default="nccl", type=str, help="backend to use for torch DDP")
 
     parser.add_argument("arch", type=str, help="text-detection model to train")
     parser.add_argument("--train_path", type=str, required=True, help="path to training data folder")
@@ -460,7 +459,7 @@ def parse_args():
     parser.add_argument("--name", type=str, default=None, help="Name of your training experiment")
     parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")
     parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for training")
-    parser.add_argument("--devices", default=None, nargs='+',type=int, help="GPU devices to use for training")
+    parser.add_argument("--devices", default=None, nargs="+", type=int, help="GPU devices to use for training")
     parser.add_argument(
         "--save-interval-epoch", dest="save_interval_epoch", action="store_true", help="Save model every epoch"
     )
@@ -506,17 +505,15 @@ def parse_args():
 if __name__ == "__main__":
     args = parse_args()
     if not torch.cuda.is_available():
-        raise AssertionError('PyTorch cannot access your GPUs. please look into it bro !!!')
-    
+        raise AssertionError("PyTorch cannot access your GPUs. please look into it bro !!!")
+
     if not isinstance(args.devices, list):
-        args.devices = list(range(torch.cuda.device_count()))  
+        args.devices = list(range(torch.cuda.device_count()))
     # no of process per gpu
     nprocs = len(args.devices)
     # Environment variables which need to be
     # set when using c10d's default "env"
     # initialization mode.
-    os.environ['MASTER_ADDR'] = 'localhost'
-    os.environ['MASTER_PORT'] = '29500'
-    mp.spawn(main, args=(nprocs, args), nprocs = nprocs, join=True)
-    
-  
\ No newline at end of file
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "29500"
+    mp.spawn(main, args=(nprocs, args), nprocs=nprocs, join=True)

From d9f0eef61bd11ee00b13037006d7ea5859b8e258 Mon Sep 17 00:00:00 2001
From: Juneja Sarjil <sarjil.juneja@injala.com>
Date: Tue, 3 Dec 2024 01:25:20 +0530
Subject: [PATCH 4/4] adding Gujarati language vocabulary

---
 doctr/datasets/vocabs.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/doctr/datasets/vocabs.py b/doctr/datasets/vocabs.py
index 94942d58e3..281a739fe1 100644
--- a/doctr/datasets/vocabs.py
+++ b/doctr/datasets/vocabs.py
@@ -23,15 +23,23 @@
     "hindi_letters": "अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह",
     "hindi_digits": "०१२३४५६७८९",
     "hindi_punctuation": "।,?!:्ॐ॰॥॰",
+    "gujarati_consonants":"કખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહષજ્ઞ",
+    "gujarati_vowels": "અઆઇઈઉઊઋએઐઓઔઅંઅઃ ",
+    "gujarati_digits":"૦૧૨૩૪૫૬૭૮૯",
+    "gujarati_diacritics":"""કકાકિકીકુકૂકૃકેકૈકોકૌકંકઃ ખખાખિખીખુખૂખૃખેખૈખોખૌખંખઃ ગગાગિગીગુગૂગૃગેગૈગોગૌગંગઃ ઘઘાઘિઘીઘુઘૂઘૃઘેઘૈઘોઘૌઘંઘઃ ઙઙાઙિઙીઙુઙૂઙૃઙેઙૈઙોઙૌઙંઙઃ ચચાચિચીચુચૂચૃચેચૈચોચૌચંચઃ 
+    છછાછિછીછુછૂછૃછેછૈછોછૌછંછઃ જજાજિજીજુજુજૃજેજૈજોજૌજંજઃ ઝઝાઝિઝીઝુઝૂઝૃઝેઝૈઝોઝૌઝંઝઃ ઞઞાઞિઞીઞુઞૂઞૃઞેઞૈઞોઞૌઞંઞઃ ટટાટિટીટુટૂટૃટેટૈટોટૌટંટઃ ઠઠાઠિઠીઠુઠૂઠૃઠેઠૈઠોઠૌઠંઠઃ ડડાડિડીડુડૂડૃડેડૈડોડૌડંડઃ ઢઢાઢિઢીઢુઢૂઢૃઢેઢૈઢોઢૌઢંઢઃ ણણાણિણીણુણૂણૃણેણૈણોણૌણંણઃ 
+    તતાતિતીતુતૂતૃતેતૈતોતૌતંતઃ થથાથિથીથુથૂથૃથીથૈથોથૌથંથઃ દદાદિદીદુદૂદૃદેદૈદોદૌદંદઃ ધધાધિધીધુધૂધૃધેધૈધોધૌધંધઃ નનાનિનીનુનૂનૃનેનૈનોનૌનંનઃ પપાપિપીપુપૂપૃપેપૈપોપૌપંપઃ ફફાફિફીફુફૂફૃફેફૈફોફૌફંફઃ બબાબિબીબુબૂબૃબેબૈબોબૌબંબઃ ભભાભિભીભુભૂભૃભેભૈભોભૌભંભઃ 
+    મમામિમીમુમૂમૃમેમામોમાયમંમઃ યયાયિયીયુયુયૃયેયૈયોયૌયંયઃ રરારિરીરૂરૃરેરૈરોરૌરંરઃ લલાલિલીલુલૂલૃલેલૈલોલૌલંલઃ વવાવિવીવિવૂવૃવેવૈવોવૈવંવઃ શશાશિશીશુશૂશૃશેશૈશોશૌશંશઃ ષષાષિષીષુષૂષૃષેષૈષોષૌષંષઃ જ્ઞજ્ઞાજ્ઞિજ્ઞીજ્ઞુજ્ઞૂજ્ઞૃજ્ઞેજ્ઞૈજ્ઞોજ્ઞૌજ્ઞંજ્ઞઃ """,
+    "gujarati_punctuation":",.!?:;'()[]-_/|\✶૰૱`'",
     "bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ",
     "bangla_digits": "০১২৩৪৫৬৭৮৯",
     "generic_cyrillic_letters": "абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ",
 }
 
 VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"]
-VOCABS["english"] = VOCABS["latin"] + "°" + VOCABS["currency"]
+VOCABS["english"] = VOCABS["latin"] + "©" +VOCABS["currency"].replace("¥",' ').replace('€','™').replace('¢','®')
 VOCABS["legacy_french"] = VOCABS["latin"] + "°" + "àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ" + VOCABS["currency"]
-VOCABS["french"] = VOCABS["english"] + "àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ"
+VOCABS["french"] = VOCABS["english"] + "àâéèêëáîôùûüÀÂÉÈÊËÎÏÔÙÛÜÚÇ" 
 VOCABS["portuguese"] = VOCABS["english"] + "áàâãéêíïóôõúüçÁÀÂÃÉÊÍÏÓÔÕÚÜÇ"
 VOCABS["spanish"] = VOCABS["english"] + "áéíóúüñÁÉÍÓÚÜÑ" + "¡¿"
 VOCABS["italian"] = VOCABS["english"] + "àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ"
@@ -59,6 +67,13 @@
 )
 VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪"
 VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"]
+VOCABS["gujarati"] = (         
+    VOCABS['gujarati_consonants'] 
+    + VOCABS["gujarati_vowels"] 
+    + VOCABS['gujarati_digits'] 
+    + VOCABS['gujarati_diacritics'] 
+    + VOCABS['gujarati_punctuation']
+)
 VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"]
 VOCABS["ukrainian"] = (
     VOCABS["generic_cyrillic_letters"] + VOCABS["digits"] + VOCABS["punctuation"] + VOCABS["currency"] + "ґіїєҐІЇЄ₴"