From bf70e582af248139f66722580827f593c8b022ce Mon Sep 17 00:00:00 2001
From: "fan.mo"
Date: Mon, 8 Jul 2024 15:40:55 +0800
Subject: [PATCH] Support MUSA

---
 ultralytics/engine/trainer.py    | 20 ++++++------
 ultralytics/utils/torch_utils.py | 52 ++++++++++++++++----------
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/ultralytics/engine/trainer.py b/ultralytics/engine/trainer.py
index c833e761655..8be45f99f0a 100644
--- a/ultralytics/engine/trainer.py
+++ b/ultralytics/engine/trainer.py
@@ -172,7 +172,7 @@ def train(self):
             world_size = len(self.args.device.split(","))
         elif isinstance(self.args.device, (tuple, list)):  # i.e. device=[0, 1, 2, 3] (multi-GPU from CLI is list)
             world_size = len(self.args.device)
-        elif torch.cuda.is_available():  # i.e. device=None or device='' or device=number
+        elif torch.musa.is_available():  # i.e. device=None or device='' or device=number
             world_size = 1  # default to device 0
         else:  # i.e. device='cpu' or 'mps'
             world_size = 0
@@ -213,12 +213,12 @@ def _setup_scheduler(self):
 
     def _setup_ddp(self, world_size):
         """Initializes and sets the DistributedDataParallel parameters for training."""
-        torch.cuda.set_device(RANK)
-        self.device = torch.device("cuda", RANK)
+        torch.musa.set_device(RANK)
+        self.device = torch.device("musa", RANK)
         # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
-        os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
+        os.environ["TORCH_MCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
         dist.init_process_group(
-            backend="nccl" if dist.is_nccl_available() else "gloo",
+            backend="mccl",
             timeout=timedelta(seconds=10800),  # 3 hours
             rank=RANK,
             world_size=world_size,
@@ -264,7 +264,7 @@ def _setup_train(self, world_size):
         if RANK > -1 and world_size > 1:  # DDP
             dist.broadcast(self.amp, src=0)  # broadcast the tensor from rank 0 to all other ranks (returns None)
         self.amp = bool(self.amp)  # as boolean
-        self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp)
+        self.scaler = torch.musa.amp.GradScaler(enabled=self.amp)
         if world_size > 1:
             self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[RANK])
 
@@ -376,7 +376,7 @@ def _do_train(self, world_size=1):
                         x["momentum"] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])
 
                 # Forward
-                with torch.cuda.amp.autocast(self.amp):
+                with torch.musa.amp.autocast(self.amp):
                     batch = self.preprocess_batch(batch)
                     self.loss, self.loss_items = self.model(batch)
                     if RANK != -1:
@@ -404,7 +404,7 @@ def _do_train(self, world_size=1):
                             break
 
                 # Log
-                mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G"  # (GB)
+                mem = f"{torch.musa.memory_reserved() / 1E9 if torch.musa.is_available() else 0:.3g}G"  # (GB)
                 loss_len = self.tloss.shape[0] if len(self.tloss.shape) else 1
                 losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
                 if RANK in {-1, 0}:
@@ -449,7 +449,7 @@ def _do_train(self, world_size=1):
                 self.stop |= epoch >= self.epochs  # stop if exceeded epochs
             self.run_callbacks("on_fit_epoch_end")
             gc.collect()
-            torch.cuda.empty_cache()  # clear GPU memory at end of epoch, may help reduce CUDA out of memory errors
+            torch.musa.empty_cache()  # clear GPU memory at end of epoch, may help reduce MUSA out of memory errors
 
             # Early Stopping
             if RANK != -1:  # if DDP training
@@ -471,7 +471,7 @@ def _do_train(self, world_size=1):
                 self.plot_metrics()
             self.run_callbacks("on_train_end")
         gc.collect()
-        torch.cuda.empty_cache()
+        torch.musa.empty_cache()
         self.run_callbacks("teardown")
 
     def save_model(self):
diff --git a/ultralytics/utils/torch_utils.py b/ultralytics/utils/torch_utils.py
index 0016c970341..df74d780fba 100644
--- a/ultralytics/utils/torch_utils.py
+++ b/ultralytics/utils/torch_utils.py
@@ -86,7 +86,7 @@ def select_device(device="", batch=0, newline=False, verbose=True):
 
     Args:
         device (str | torch.device, optional): Device string or torch.device object.
-            Options are 'None', 'cpu', or 'cuda', or '0' or '0,1,2,3'. Defaults to an empty string, which auto-selects
+            Options are 'None', 'cpu', or 'musa', or '0' or '0,1,2,3'. Defaults to an empty string, which auto-selects
             the first available GPU, or CPU if no GPU is available.
         batch (int, optional): Batch size being used in your model. Defaults to 0.
         newline (bool, optional): If True, adds a newline at the end of the log string. Defaults to False.
@@ -100,14 +100,14 @@ def select_device(device="", batch=0, newline=False, verbose=True):
             devices when using multiple GPUs.
 
     Examples:
-        >>> select_device('cuda:0')
-        device(type='cuda', index=0)
+        >>> select_device('musa:0')
+        device(type='musa', index=0)
 
         >>> select_device('cpu')
         device(type='cpu')
 
     Note:
-        Sets the 'CUDA_VISIBLE_DEVICES' environment variable for specifying which GPUs to use.
+        Sets the 'MUSA_VISIBLE_DEVICES' environment variable for specifying which GPUs to use.
     """
 
     if isinstance(device, torch.device):
@@ -115,37 +115,37 @@ def select_device(device="", batch=0, newline=False, verbose=True):
 
     s = f"Ultralytics YOLOv{__version__} 🚀 Python-{PYTHON_VERSION} torch-{torch.__version__} "
     device = str(device).lower()
-    for remove in "cuda:", "none", "(", ")", "[", "]", "'", " ":
-        device = device.replace(remove, "")  # to string, 'cuda:0' -> '0' and '(0, 1)' -> '0,1'
+    for remove in "musa:", "none", "(", ")", "[", "]", "'", " ":
+        device = device.replace(remove, "")  # to string, 'musa:0' -> '0' and '(0, 1)' -> '0,1'
     cpu = device == "cpu"
     mps = device in {"mps", "mps:0"}  # Apple Metal Performance Shaders (MPS)
     if cpu or mps:
-        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force torch.cuda.is_available() = False
+        os.environ["MUSA_VISIBLE_DEVICES"] = "-1"  # force torch.musa.is_available() = False
     elif device:  # non-cpu device requested
-        if device == "cuda":
+        if device == "musa":
             device = "0"
         visible = os.environ.get("CUDA_VISIBLE_DEVICES", None)
         os.environ["CUDA_VISIBLE_DEVICES"] = device  # set environment variable - must be before assert is_available()
-        if not (torch.cuda.is_available() and torch.cuda.device_count() >= len(device.split(","))):
+        if not (torch.musa.is_available() and torch.musa.device_count() >= len(device.split(","))):
             LOGGER.info(s)
             install = (
                 "See https://pytorch.org/get-started/locally/ for up-to-date torch install instructions if no "
-                "CUDA devices are seen by torch.\n"
-                if torch.cuda.device_count() == 0
+                "MUSA devices are seen by torch.\n"
+                if torch.musa.device_count() == 0
                 else ""
             )
             raise ValueError(
-                f"Invalid CUDA 'device={device}' requested."
-                f" Use 'device=cpu' or pass valid CUDA device(s) if available,"
+                f"Invalid MUSA 'device={device}' requested."
+                f" Use 'device=cpu' or pass valid MUSA device(s) if available,"
                 f" i.e. 'device=0' or 'device=0,1,2,3' for Multi-GPU.\n"
-                f"\ntorch.cuda.is_available(): {torch.cuda.is_available()}"
-                f"\ntorch.cuda.device_count(): {torch.cuda.device_count()}"
+                f"\ntorch.musa.is_available(): {torch.musa.is_available()}"
+                f"\ntorch.musa.device_count(): {torch.musa.device_count()}"
                 f"\nos.environ['CUDA_VISIBLE_DEVICES']: {visible}\n"
                 f"{install}"
             )
 
-    if not cpu and not mps and torch.cuda.is_available():  # prefer GPU if available
-        devices = device.split(",") if device else "0"  # range(torch.cuda.device_count())  # i.e. 0,1,6,7
+    if not cpu and not mps and torch.musa.is_available():  # prefer GPU if available
+        devices = device.split(",") if device else "0"  # range(torch.musa.device_count())  # i.e. 0,1,6,7
         n = len(devices)  # device count
         if n > 1:  # multi-GPU
             if batch < 1:
@@ -160,9 +160,9 @@ def select_device(device="", batch=0, newline=False, verbose=True):
                 )
         space = " " * (len(s) + 1)
         for i, d in enumerate(devices):
-            p = torch.cuda.get_device_properties(i)
-            s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n"  # bytes to MB
-        arg = "cuda:0"
+            p = torch.musa.get_device_properties(i)
+            s += f"{'' if i == 0 else space}MUSA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n"  # bytes to MB
+        arg = "musa:0"
     elif mps and TORCH_2_0 and torch.backends.mps.is_available():
         # Prefer MPS if available
         s += f"MPS ({get_cpu_info()})\n"
@@ -178,8 +178,8 @@ def select_device(device="", batch=0, newline=False, verbose=True):
 
 def time_sync():
     """PyTorch-accurate time."""
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
+    if torch.musa.is_available():
+        torch.musa.synchronize()
     return time.time()
 
 
@@ -440,8 +440,8 @@ def init_seeds(seed=0, deterministic=False):
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)  # for Multi-GPU, exception safe
+    torch.musa.manual_seed(seed)
+    torch.musa.manual_seed_all(seed)  # for Multi-GPU, exception safe
     # torch.backends.cudnn.benchmark = True  # AutoBatch problem https://github.com/ultralytics/yolov5/issues/9287
     if deterministic:
         if TORCH_2_0:
@@ -615,7 +615,7 @@ def profile(input, ops, n=10, device=None):
                        t[2] = float("nan")
                    tf += (t[1] - t[0]) * 1000 / n  # ms per op forward
                    tb += (t[2] - t[1]) * 1000 / n  # ms per op backward
-                mem = torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0  # (GB)
+                mem = torch.musa.memory_reserved() / 1e9 if torch.musa.is_available() else 0  # (GB)
                 s_in, s_out = (tuple(x.shape) if isinstance(x, torch.Tensor) else "list" for x in (x, y))  # shapes
                 p = sum(x.numel() for x in m.parameters()) if isinstance(m, nn.Module) else 0  # parameters
                 LOGGER.info(f"{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}")
@@ -624,7 +624,7 @@ def profile(input, ops, n=10, device=None):
                 LOGGER.info(e)
                 results.append(None)
             gc.collect()  # attempt to free unused memory
-            torch.cuda.empty_cache()
+            torch.musa.empty_cache()
     return results

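Note (not part of the diff above): a minimal usage sketch of the patched device selection, assuming Moore Threads' torch_musa extension is installed and that importing it registers the torch.musa namespace this patch relies on.

    # Hypothetical usage sketch; torch_musa is an assumed external dependency,
    # not something added by this patch.
    import torch
    import torch_musa  # noqa: F401  # registers the 'musa' device type and the torch.musa namespace

    from ultralytics.utils.torch_utils import select_device

    # With the patch applied, 'musa', 'musa:0', '0', or '0,1' select MUSA GPUs,
    # while 'cpu' hides all devices via MUSA_VISIBLE_DEVICES and falls back to CPU.
    device = select_device("musa:0")
    print(device)  # expected: device(type='musa', index=0) when a MUSA GPU is visible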
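Similarly, a hedged sketch of the mixed-precision pattern the trainer changes depend on, assuming torch.musa.amp mirrors the torch.cuda.amp GradScaler/autocast API (which is what the patched _setup_train and _do_train presume).

    # Sketch only: requires a MUSA device and the torch_musa extension.
    import torch
    import torch_musa  # noqa: F401  # assumed installed; exposes torch.musa.amp

    model = torch.nn.Linear(8, 1).to("musa")
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    scaler = torch.musa.amp.GradScaler(enabled=True)  # counterpart of torch.cuda.amp.GradScaler

    x = torch.randn(4, 8, device="musa")
    with torch.musa.amp.autocast(True):  # counterpart of torch.cuda.amp.autocast
        loss = model(x).sum()

    scaler.scale(loss).backward()  # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)         # unscale gradients, then step the optimizer
    scaler.update()                # adjust the scale factor for the next iteration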