From 8c2ca8a5f0890da6da3f49c5439ec7ef84da42c1 Mon Sep 17 00:00:00 2001 From: felix Date: Fri, 22 Nov 2024 07:48:49 +0100 Subject: [PATCH] [misc] Small ddp script adjustments --- references/detection/README.md | 4 ++-- .../{train_pytorch_DDP.py => train_pytorch_ddp.py} | 9 ++++----- references/recognition/train_pytorch_ddp.py | 6 ++++-- 3 files changed, 10 insertions(+), 9 deletions(-) rename references/detection/{train_pytorch_DDP.py => train_pytorch_ddp.py} (98%) diff --git a/references/detection/README.md b/references/detection/README.md index d2bddf6b7..884437e8e 100644 --- a/references/detection/README.md +++ b/references/detection/README.md @@ -24,9 +24,10 @@ or PyTorch: ```shell python references/detection/train_pytorch.py db_resnet50 --train_path path/to/your/train_set --val_path path/to/your/val_set --epochs 5 ``` + ### Multi-GPU support (PyTorch only) -Multi-GPU support on Detection task with PyTorch has been added. +Multi-GPU support on Detection task with PyTorch has been added. Arguments are the same than the ones from single GPU, except: - `--devices`: **by default, if you do not pass `--devices`, it will use all GPUs on your computer**. @@ -41,7 +42,6 @@ device_names = [torch.cuda.get_device_name(d) for d in devices] - `--backend`: you can specify another `backend` for `DistribuedDataParallel` if the default one is not available on your operating system. Fastest one is `nccl` according to [PyTorch Documentation](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html). - ```shell python references/detection/train_pytorch_ddp.py db_resnet50 --train_path path/to/your/train_set --val_path path/to/your/val_set --epochs 5 --devices 0 1 --backend nccl ``` diff --git a/references/detection/train_pytorch_DDP.py b/references/detection/train_pytorch_ddp.py similarity index 98% rename from references/detection/train_pytorch_DDP.py rename to references/detection/train_pytorch_ddp.py index 63df84885..ba0bf5d8f 100644 --- a/references/detection/train_pytorch_DDP.py +++ b/references/detection/train_pytorch_ddp.py @@ -11,6 +11,7 @@ import hashlib import multiprocessing import time + import numpy as np import torch @@ -330,9 +331,7 @@ def main(rank: int, world_size: int, args): pin_memory=torch.cuda.is_available(), collate_fn=train_set.collate_fn, ) - print( - f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)" - ) + print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") with open(os.path.join(args.train_path, "labels.json"), "rb") as f: train_hash = hashlib.sha256(f.read()).hexdigest() @@ -446,7 +445,7 @@ def parse_args(): import argparse parser = argparse.ArgumentParser( - description="DocTR training script for text detection (PyTorch)", + description="DocTR DDP training script for text detection (PyTorch)", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) @@ -505,7 +504,7 @@ def parse_args(): if __name__ == "__main__": args = parse_args() if not torch.cuda.is_available(): - raise AssertionError("PyTorch cannot access your GPUs. please look into it bro !!!") + raise AssertionError("PyTorch cannot access your GPUs. Please investigate!") if not isinstance(args.devices, list): args.devices = list(range(torch.cuda.device_count())) diff --git a/references/recognition/train_pytorch_ddp.py b/references/recognition/train_pytorch_ddp.py index 8a7f1428b..2f40ed766 100644 --- a/references/recognition/train_pytorch_ddp.py +++ b/references/recognition/train_pytorch_ddp.py @@ -357,10 +357,13 @@ def parse_args(): import argparse parser = argparse.ArgumentParser( - description="DocTR training script for text recognition (PyTorch)", + description="DocTR DDP training script for text recognition (PyTorch)", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) + # DDP related args + parser.add_argument("--backend", default="nccl", type=str, help="Backend to use for Torch DDP") + parser.add_argument("arch", type=str, help="text-recognition model to train") parser.add_argument("--train_path", type=str, default=None, help="path to train data folder(s)") parser.add_argument("--val_path", type=str, default=None, help="path to val data folder") @@ -384,7 +387,6 @@ def parse_args(): parser.add_argument("--name", type=str, default=None, help="Name of your training experiment") parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on") parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size for training") - parser.add_argument("--backend", default="nccl", type=str, help="Backend to use for Torch DDP") parser.add_argument("--devices", default=None, nargs="+", type=int, help="GPU devices to use for training") parser.add_argument("--input_size", type=int, default=32, help="input size H for the model, W = 4*H") parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam)")