Commit: Support MUSA
fmo-mt committed Jul 8, 2024
1 parent 6e3654e commit bf70e58
Showing 2 changed files with 36 additions and 36 deletions.
20 changes: 10 additions & 10 deletions ultralytics/engine/trainer.py
@@ -172,7 +172,7 @@ def train(self):
world_size = len(self.args.device.split(","))
elif isinstance(self.args.device, (tuple, list)): # i.e. device=[0, 1, 2, 3] (multi-GPU from CLI is list)
world_size = len(self.args.device)
-elif torch.cuda.is_available(): # i.e. device=None or device='' or device=number
+elif torch.musa.is_available(): # i.e. device=None or device='' or device=number
world_size = 1 # default to device 0
else: # i.e. device='cpu' or 'mps'
world_size = 0
Expand Down Expand Up @@ -213,12 +213,12 @@ def _setup_scheduler(self):

def _setup_ddp(self, world_size):
"""Initializes and sets the DistributedDataParallel parameters for training."""
-torch.cuda.set_device(RANK)
-self.device = torch.device("cuda", RANK)
+torch.musa.set_device(RANK)
+self.device = torch.device("musa", RANK)
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
os.environ["TORCH_MCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
dist.init_process_group(
backend="nccl" if dist.is_nccl_available() else "gloo",
backend="mccl",
timeout=timedelta(seconds=10800), # 3 hours
rank=RANK,
world_size=world_size,
@@ -264,7 +264,7 @@ def _setup_train(self, world_size):
if RANK > -1 and world_size > 1: # DDP
dist.broadcast(self.amp, src=0) # broadcast the tensor from rank 0 to all other ranks (returns None)
self.amp = bool(self.amp) # as boolean
-self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp)
+self.scaler = torch.musa.amp.GradScaler(enabled=self.amp)
if world_size > 1:
self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[RANK])

@@ -376,7 +376,7 @@ def _do_train(self, world_size=1):
x["momentum"] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])

# Forward
-with torch.cuda.amp.autocast(self.amp):
+with torch.musa.amp.autocast(self.amp):
batch = self.preprocess_batch(batch)
self.loss, self.loss_items = self.model(batch)
if RANK != -1:
@@ -404,7 +404,7 @@ def _do_train(self, world_size=1):
break

# Log
mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G" # (GB)
mem = f"{torch.musa.memory_reserved() / 1E9 if torch.musa.is_available() else 0:.3g}G" # (GB)
loss_len = self.tloss.shape[0] if len(self.tloss.shape) else 1
losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
if RANK in {-1, 0}:
@@ -449,7 +449,7 @@ def _do_train(self, world_size=1):
self.stop |= epoch >= self.epochs # stop if exceeded epochs
self.run_callbacks("on_fit_epoch_end")
gc.collect()
-torch.cuda.empty_cache() # clear GPU memory at end of epoch, may help reduce CUDA out of memory errors
+torch.musa.empty_cache() # clear GPU memory at end of epoch, may help reduce MUSA out of memory errors

# Early Stopping
if RANK != -1: # if DDP training
@@ -471,7 +471,7 @@ def _do_train(self, world_size=1):
self.plot_metrics()
self.run_callbacks("on_train_end")
gc.collect()
-torch.cuda.empty_cache()
+torch.musa.empty_cache()
self.run_callbacks("teardown")

def save_model(self):
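
For reference, a minimal sketch (not part of this commit) of the torch.musa device/AMP pattern the patched trainer relies on. It assumes the torch_musa extension is installed (importing it is what registers the torch.musa namespace); the dummy tensor and sizes below are illustrative only.

import torch

try:
    import torch_musa  # noqa: F401  # assumed package name; importing it registers torch.musa
    has_musa = torch.musa.is_available()
except ImportError:
    has_musa = False

device = torch.device("musa", 0) if has_musa else torch.device("cpu")

if has_musa:
    scaler = torch.musa.amp.GradScaler(enabled=True)  # mirrors the patched _setup_train
    with torch.musa.amp.autocast(True):  # mirrors the forward pass in _do_train
        x = torch.randn(2, 3, 32, 32, device=device)  # dummy batch
        loss = (x * 2).sum()
    print(f"reserved: {torch.musa.memory_reserved() / 1e9:.3g} GB")
    torch.musa.empty_cache()  # same cleanup the trainer runs at the end of each epoch
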
52 changes: 26 additions & 26 deletions ultralytics/utils/torch_utils.py
@@ -86,7 +86,7 @@ def select_device(device="", batch=0, newline=False, verbose=True):
Args:
device (str | torch.device, optional): Device string or torch.device object.
-Options are 'None', 'cpu', or 'cuda', or '0' or '0,1,2,3'. Defaults to an empty string, which auto-selects
+Options are 'None', 'cpu', or 'musa', or '0' or '0,1,2,3'. Defaults to an empty string, which auto-selects
the first available GPU, or CPU if no GPU is available.
batch (int, optional): Batch size being used in your model. Defaults to 0.
newline (bool, optional): If True, adds a newline at the end of the log string. Defaults to False.
@@ -100,52 +100,52 @@ def select_device(device="", batch=0, newline=False, verbose=True):
devices when using multiple GPUs.
Examples:
>>> select_device('cuda:0')
device(type='cuda', index=0)
>>> select_device('musa:0')
device(type='musa', index=0)
>>> select_device('cpu')
device(type='cpu')
Note:
-Sets the 'CUDA_VISIBLE_DEVICES' environment variable for specifying which GPUs to use.
+Sets the 'MUSA_VISIBLE_DEVICES' environment variable for specifying which GPUs to use.
"""

if isinstance(device, torch.device):
return device

s = f"Ultralytics YOLOv{__version__} 🚀 Python-{PYTHON_VERSION} torch-{torch.__version__} "
device = str(device).lower()
for remove in "cuda:", "none", "(", ")", "[", "]", "'", " ":
device = device.replace(remove, "") # to string, 'cuda:0' -> '0' and '(0, 1)' -> '0,1'
for remove in "musa:", "none", "(", ")", "[", "]", "'", " ":
device = device.replace(remove, "") # to string, 'musa:0' -> '0' and '(0, 1)' -> '0,1'
cpu = device == "cpu"
mps = device in {"mps", "mps:0"} # Apple Metal Performance Shaders (MPS)
if cpu or mps:
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # force torch.cuda.is_available() = False
os.environ["MUSA_VISIBLE_DEVICES"] = "-1" # force torch.musa.is_available() = False
elif device: # non-cpu device requested
if device == "cuda":
if device == "musa":
device = "0"
visible = os.environ.get("CUDA_VISIBLE_DEVICES", None)
os.environ["CUDA_VISIBLE_DEVICES"] = device # set environment variable - must be before assert is_available()
-if not (torch.cuda.is_available() and torch.cuda.device_count() >= len(device.split(","))):
+if not (torch.musa.is_available() and torch.musa.device_count() >= len(device.split(","))):
LOGGER.info(s)
install = (
"See https://pytorch.org/get-started/locally/ for up-to-date torch install instructions if no "
"CUDA devices are seen by torch.\n"
if torch.cuda.device_count() == 0
"MUSA devices are seen by torch.\n"
if torch.musa.device_count() == 0
else ""
)
raise ValueError(
f"Invalid CUDA 'device={device}' requested."
f" Use 'device=cpu' or pass valid CUDA device(s) if available,"
f"Invalid MUSA 'device={device}' requested."
f" Use 'device=cpu' or pass valid MUSA device(s) if available,"
f" i.e. 'device=0' or 'device=0,1,2,3' for Multi-GPU.\n"
f"\ntorch.cuda.is_available(): {torch.cuda.is_available()}"
f"\ntorch.cuda.device_count(): {torch.cuda.device_count()}"
f"\ntorch.musa.is_available(): {torch.musa.is_available()}"
f"\ntorch.musa.device_count(): {torch.musa.device_count()}"
f"\nos.environ['CUDA_VISIBLE_DEVICES']: {visible}\n"
f"{install}"
)

-if not cpu and not mps and torch.cuda.is_available(): # prefer GPU if available
-devices = device.split(",") if device else "0" # range(torch.cuda.device_count()) # i.e. 0,1,6,7
+if not cpu and not mps and torch.musa.is_available(): # prefer GPU if available
+devices = device.split(",") if device else "0" # range(torch.musa.device_count()) # i.e. 0,1,6,7
n = len(devices) # device count
if n > 1: # multi-GPU
if batch < 1:
@@ -160,9 +160,9 @@ def select_device(device="", batch=0, newline=False, verbose=True):
)
space = " " * (len(s) + 1)
for i, d in enumerate(devices):
-p = torch.cuda.get_device_properties(i)
-s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n" # bytes to MB
-arg = "cuda:0"
+p = torch.musa.get_device_properties(i)
+s += f"{'' if i == 0 else space}MUSA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n" # bytes to MB
+arg = "musa:0"
elif mps and TORCH_2_0 and torch.backends.mps.is_available():
# Prefer MPS if available
s += f"MPS ({get_cpu_info()})\n"
@@ -178,8 +178,8 @@ def select_device(device="", batch=0, newline=False, verbose=True):

def time_sync():
"""PyTorch-accurate time."""
-if torch.cuda.is_available():
-torch.cuda.synchronize()
+if torch.musa.is_available():
+torch.musa.synchronize()
return time.time()


@@ -440,8 +440,8 @@ def init_seeds(seed=0, deterministic=False):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
-torch.cuda.manual_seed(seed)
-torch.cuda.manual_seed_all(seed) # for Multi-GPU, exception safe
+torch.musa.manual_seed(seed)
+torch.musa.manual_seed_all(seed) # for Multi-GPU, exception safe
# torch.backends.cudnn.benchmark = True # AutoBatch problem https://github.com/ultralytics/yolov5/issues/9287
if deterministic:
if TORCH_2_0:
@@ -615,7 +615,7 @@ def profile(input, ops, n=10, device=None):
t[2] = float("nan")
tf += (t[1] - t[0]) * 1000 / n # ms per op forward
tb += (t[2] - t[1]) * 1000 / n # ms per op backward
-mem = torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0 # (GB)
+mem = torch.musa.memory_reserved() / 1e9 if torch.musa.is_available() else 0 # (GB)
s_in, s_out = (tuple(x.shape) if isinstance(x, torch.Tensor) else "list" for x in (x, y)) # shapes
p = sum(x.numel() for x in m.parameters()) if isinstance(m, nn.Module) else 0 # parameters
LOGGER.info(f"{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}")
@@ -624,7 +624,7 @@ def profile(input, ops, n=10, device=None):
LOGGER.info(e)
results.append(None)
gc.collect() # attempt to free unused memory
-torch.cuda.empty_cache()
+torch.musa.empty_cache()
return results


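For illustration, a hedged usage sketch of the patched select_device, based on its updated docstring and the branches shown above; it assumes a MUSA-capable environment where torch.musa is available.

from ultralytics.utils.torch_utils import select_device

device = select_device("musa:0")  # -> device(type='musa', index=0), per the updated docstring
device = select_device("0,1")     # multi-GPU request; resolves to 'musa:0' as the default device
device = select_device("cpu")     # -> device(type='cpu'); also sets MUSA_VISIBLE_DEVICES=-1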
