Skip to content

Commit

Permalink
ultralytics 8.2.71 Multinode DDP training (ultralytics#14879)
Browse files Browse the repository at this point in the history
Co-authored-by: Haris Rehman <[email protected]>
Co-authored-by: Glenn Jocher <[email protected]>
Co-authored-by: UltralyticsAssistant <[email protected]>
  • Loading branch information
4 people authored Aug 1, 2024
1 parent 16fc325 commit 9c5d1a2
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
requirements.txt
setup.py
ultralytics.egg-info

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down
2 changes: 1 addition & 1 deletion ultralytics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

__version__ = "8.2.70"
__version__ = "8.2.71"

import os

Expand Down
5 changes: 3 additions & 2 deletions ultralytics/engine/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ultralytics.nn.tasks import attempt_load_one_weight, attempt_load_weights
from ultralytics.utils import (
DEFAULT_CFG,
LOCAL_RANK,
LOGGER,
RANK,
TQDM,
Expand Down Expand Up @@ -129,7 +130,7 @@ def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):

# Model and Dataset
self.model = check_model_file_from_stem(self.args.model) # add suffix, i.e. yolov8n -> yolov8n.pt
with torch_distributed_zero_first(RANK): # avoid auto-downloading dataset multiple times
with torch_distributed_zero_first(LOCAL_RANK): # avoid auto-downloading dataset multiple times
self.trainset, self.testset = self.get_dataset()
self.ema = None

Expand Down Expand Up @@ -285,7 +286,7 @@ def _setup_train(self, world_size):

# Dataloaders
batch_size = self.batch_size // max(world_size, 1)
self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode="train")
self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=LOCAL_RANK, mode="train")
if RANK in {-1, 0}:
# Note: When training DOTA dataset, double batch size could get OOM on images with >2000 objects.
self.test_loader = self.get_dataloader(
Expand Down
3 changes: 2 additions & 1 deletion ultralytics/utils/torch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,12 @@
def torch_distributed_zero_first(local_rank: int):
    """
    Ensure all processes in distributed training wait for the local master (local_rank 0) to finish a task first.

    Used as a context manager (decorated with contextlib.contextmanager upstream) around work that should run
    once per node — e.g. dataset download/caching — before the other local ranks proceed.

    Args:
        local_rank (int): Rank of the current process on its node; -1 means non-distributed execution.

    Yields:
        None: Control returns to the caller's `with` body between the two barrier phases.
    """
    initialized = dist.is_available() and dist.is_initialized()

    # Non-master local ranks block here until the local master (local_rank 0) reaches its barrier below.
    if initialized and local_rank not in {-1, 0}:
        dist.barrier(device_ids=[local_rank])
    yield
    # Local master releases the waiting ranks once its task is done. NOTE: barrier on `local_rank`
    # (not device 0) so each node's master synchronizes on its own GPU — required for multinode DDP.
    if initialized and local_rank == 0:
        dist.barrier(device_ids=[local_rank])


def smart_inference_mode():
Expand Down

0 comments on commit 9c5d1a2

Please sign in to comment.