From 9c5d1a24511397d1d355eb3d759a49d2939e7123 Mon Sep 17 00:00:00 2001
From: Haris Rehman <115916265+Harri200191@users.noreply.github.com>
Date: Thu, 1 Aug 2024 20:31:03 +0500
Subject: [PATCH] `ultralytics 8.2.71` Multinode DDP training (#14879)

Co-authored-by: Haris Rehman
Co-authored-by: Glenn Jocher
Co-authored-by: UltralyticsAssistant
---
 .gitignore                       | 3 +++
 ultralytics/__init__.py          | 2 +-
 ultralytics/engine/trainer.py    | 5 +++--
 ultralytics/utils/torch_utils.py | 3 ++-
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1b68ec6b2c0..1c0c5fbea61 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,9 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+requirements.txt
+setup.py
+ultralytics.egg-info
 
 # PyInstaller
 # Usually these files are written by a python script from a template
diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py
index affb8e35c7f..8eb9bc7812d 100644
--- a/ultralytics/__init__.py
+++ b/ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = "8.2.70"
+__version__ = "8.2.71"
 
 import os
 
diff --git a/ultralytics/engine/trainer.py b/ultralytics/engine/trainer.py
index 48e95679cff..47063466ada 100644
--- a/ultralytics/engine/trainer.py
+++ b/ultralytics/engine/trainer.py
@@ -26,6 +26,7 @@
 from ultralytics.nn.tasks import attempt_load_one_weight, attempt_load_weights
 from ultralytics.utils import (
     DEFAULT_CFG,
+    LOCAL_RANK,
     LOGGER,
     RANK,
     TQDM,
@@ -129,7 +130,7 @@ def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
 
         # Model and Dataset
         self.model = check_model_file_from_stem(self.args.model)  # add suffix, i.e. yolov8n -> yolov8n.pt
-        with torch_distributed_zero_first(RANK):  # avoid auto-downloading dataset multiple times
+        with torch_distributed_zero_first(LOCAL_RANK):  # avoid auto-downloading dataset multiple times
             self.trainset, self.testset = self.get_dataset()
         self.ema = None
 
@@ -285,7 +286,7 @@ def _setup_train(self, world_size):
 
         # Dataloaders
         batch_size = self.batch_size // max(world_size, 1)
-        self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode="train")
+        self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=LOCAL_RANK, mode="train")
         if RANK in {-1, 0}:
             # Note: When training DOTA dataset, double batch size could get OOM on images with >2000 objects.
             self.test_loader = self.get_dataloader(
diff --git a/ultralytics/utils/torch_utils.py b/ultralytics/utils/torch_utils.py
index 624167694f4..fd24403d5a2 100644
--- a/ultralytics/utils/torch_utils.py
+++ b/ultralytics/utils/torch_utils.py
@@ -48,11 +48,12 @@ def torch_distributed_zero_first(local_rank: int):
     """Ensures all processes in distributed training wait for the local master (rank 0) to complete a task first."""
     initialized = dist.is_available() and dist.is_initialized()
+
     if initialized and local_rank not in {-1, 0}:
         dist.barrier(device_ids=[local_rank])
     yield
     if initialized and local_rank == 0:
-        dist.barrier(device_ids=[0])
+        dist.barrier(device_ids=[local_rank])
 
 
 def smart_inference_mode():