Skip to content

Commit

Permalink
ultralytics 8.2.71 Multinode DDP training (ultralytics#14879)
Browse files Browse the repository at this point in the history
Co-authored-by: Haris Rehman <[email protected]>
Co-authored-by: Glenn Jocher <[email protected]>
Co-authored-by: UltralyticsAssistant <[email protected]>
  • Loading branch information
4 people authored Aug 1, 2024
1 parent 16fc325 commit 9c5d1a2
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
requirements.txt
setup.py
ultralytics.egg-info

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down
2 changes: 1 addition & 1 deletion ultralytics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

__version__ = "8.2.70"
__version__ = "8.2.71"

import os

Expand Down
5 changes: 3 additions & 2 deletions ultralytics/engine/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ultralytics.nn.tasks import attempt_load_one_weight, attempt_load_weights
from ultralytics.utils import (
DEFAULT_CFG,
LOCAL_RANK,
LOGGER,
RANK,
TQDM,
Expand Down Expand Up @@ -129,7 +130,7 @@ def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):

# Model and Dataset
self.model = check_model_file_from_stem(self.args.model) # add suffix, i.e. yolov8n -> yolov8n.pt
with torch_distributed_zero_first(RANK): # avoid auto-downloading dataset multiple times
with torch_distributed_zero_first(LOCAL_RANK): # avoid auto-downloading dataset multiple times
self.trainset, self.testset = self.get_dataset()
self.ema = None

Expand Down Expand Up @@ -285,7 +286,7 @@ def _setup_train(self, world_size):

# Dataloaders
batch_size = self.batch_size // max(world_size, 1)
self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode="train")
self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=LOCAL_RANK, mode="train")
if RANK in {-1, 0}:
# Note: When training DOTA dataset, double batch size could get OOM on images with >2000 objects.
self.test_loader = self.get_dataloader(
Expand Down
3 changes: 2 additions & 1 deletion ultralytics/utils/torch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,12 @@
def torch_distributed_zero_first(local_rank: int):
    """
    Ensure all processes in distributed training wait for the local master (local_rank 0) to finish a task first.

    Used as a context manager (decorated with contextlib.contextmanager upstream) around work that should run
    once per node — e.g. dataset download/caching — before the other local ranks proceed.

    Args:
        local_rank (int): Rank of the current process on its node; -1 means non-distributed execution.

    Yields:
        None: Control returns to the caller's `with` body between the two barrier phases.
    """
    initialized = dist.is_available() and dist.is_initialized()

    # Non-master local ranks block here until the local master (local_rank 0) reaches its barrier below.
    if initialized and local_rank not in {-1, 0}:
        dist.barrier(device_ids=[local_rank])
    yield
    # Local master releases the waiting ranks once its task is done. NOTE: barrier on `local_rank`
    # (not device 0) so each node's master synchronizes on its own GPU — required for multinode DDP.
    if initialized and local_rank == 0:
        dist.barrier(device_ids=[local_rank])


def smart_inference_mode():
Expand Down

0 comments on commit 9c5d1a2

Please sign in to comment.