From bf70e582af248139f66722580827f593c8b022ce Mon Sep 17 00:00:00 2001
From: "fan.mo"
Date: Mon, 8 Jul 2024 15:40:55 +0800
Subject: [PATCH] Support MUSA

---
 ultralytics/engine/trainer.py    | 20 ++++++------
 ultralytics/utils/torch_utils.py | 52 ++++++++++++++++----------
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/ultralytics/engine/trainer.py b/ultralytics/engine/trainer.py
index c833e761655..8be45f99f0a 100644
--- a/ultralytics/engine/trainer.py
+++ b/ultralytics/engine/trainer.py
@@ -172,7 +172,7 @@ def train(self):
             world_size = len(self.args.device.split(","))
         elif isinstance(self.args.device, (tuple, list)):  # i.e. device=[0, 1, 2, 3] (multi-GPU from CLI is list)
             world_size = len(self.args.device)
-        elif torch.cuda.is_available():  # i.e. device=None or device='' or device=number
+        elif torch.musa.is_available():  # i.e. device=None or device='' or device=number
             world_size = 1  # default to device 0
         else:  # i.e. device='cpu' or 'mps'
             world_size = 0
@@ -213,12 +213,12 @@ def _setup_scheduler(self):
 
     def _setup_ddp(self, world_size):
         """Initializes and sets the DistributedDataParallel parameters for training."""
-        torch.cuda.set_device(RANK)
-        self.device = torch.device("cuda", RANK)
+        torch.musa.set_device(RANK)
+        self.device = torch.device("musa", RANK)
         # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
-        os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
+        os.environ["TORCH_MCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
         dist.init_process_group(
-            backend="nccl" if dist.is_nccl_available() else "gloo",
+            backend="mccl",
             timeout=timedelta(seconds=10800),  # 3 hours
             rank=RANK,
             world_size=world_size,
@@ -264,7 +264,7 @@ def _setup_train(self, world_size):
         if RANK > -1 and world_size > 1:  # DDP
             dist.broadcast(self.amp, src=0)  # broadcast the tensor from rank 0 to all other ranks (returns None)
         self.amp = bool(self.amp)  # as boolean
-        self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp)
+        self.scaler = torch.musa.amp.GradScaler(enabled=self.amp)
         if world_size > 1:
             self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[RANK])
 
@@ -376,7 +376,7 @@ def _do_train(self, world_size=1):
                         x["momentum"] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])
 
                 # Forward
-                with torch.cuda.amp.autocast(self.amp):
+                with torch.musa.amp.autocast(self.amp):
                     batch = self.preprocess_batch(batch)
                     self.loss, self.loss_items = self.model(batch)
                     if RANK != -1:
@@ -404,7 +404,7 @@ def _do_train(self, world_size=1):
                             break
 
                 # Log
-                mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G"  # (GB)
+                mem = f"{torch.musa.memory_reserved() / 1E9 if torch.musa.is_available() else 0:.3g}G"  # (GB)
                 loss_len = self.tloss.shape[0] if len(self.tloss.shape) else 1
                 losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
                 if RANK in {-1, 0}:
@@ -449,7 +449,7 @@ def _do_train(self, world_size=1):
                 self.stop |= epoch >= self.epochs  # stop if exceeded epochs
             self.run_callbacks("on_fit_epoch_end")
             gc.collect()
-            torch.cuda.empty_cache()  # clear GPU memory at end of epoch, may help reduce CUDA out of memory errors
+            torch.musa.empty_cache()  # clear GPU memory at end of epoch, may help reduce MUSA out of memory errors
 
             # Early Stopping
             if RANK != -1:  # if DDP training
@@ -471,7 +471,7 @@ def _do_train(self, world_size=1):
                 self.plot_metrics()
             self.run_callbacks("on_train_end")
         gc.collect()
-        torch.cuda.empty_cache()
+        torch.musa.empty_cache()
         self.run_callbacks("teardown")
 
     def save_model(self):
diff --git a/ultralytics/utils/torch_utils.py b/ultralytics/utils/torch_utils.py
index 0016c970341..df74d780fba 100644
--- a/ultralytics/utils/torch_utils.py
+++ b/ultralytics/utils/torch_utils.py
@@ -86,7 +86,7 @@ def select_device(device="", batch=0, newline=False, verbose=True):
 
     Args:
         device (str | torch.device, optional): Device string or torch.device object.
-            Options are 'None', 'cpu', or 'cuda', or '0' or '0,1,2,3'. Defaults to an empty string, which auto-selects
+            Options are 'None', 'cpu', or 'musa', or '0' or '0,1,2,3'. Defaults to an empty string, which auto-selects
             the first available GPU, or CPU if no GPU is available.
         batch (int, optional): Batch size being used in your model. Defaults to 0.
         newline (bool, optional): If True, adds a newline at the end of the log string. Defaults to False.
@@ -100,14 +100,14 @@ def select_device(device="", batch=0, newline=False, verbose=True):
             devices when using multiple GPUs.
 
     Examples:
-        >>> select_device('cuda:0')
-        device(type='cuda', index=0)
+        >>> select_device('musa:0')
+        device(type='musa', index=0)
 
         >>> select_device('cpu')
         device(type='cpu')
 
     Note:
-        Sets the 'CUDA_VISIBLE_DEVICES' environment variable for specifying which GPUs to use.
+        Sets the 'MUSA_VISIBLE_DEVICES' environment variable for specifying which GPUs to use.
     """
 
     if isinstance(device, torch.device):
@@ -115,37 +115,37 @@ def select_device(device="", batch=0, newline=False, verbose=True):
 
     s = f"Ultralytics YOLOv{__version__} 🚀 Python-{PYTHON_VERSION} torch-{torch.__version__} "
     device = str(device).lower()
-    for remove in "cuda:", "none", "(", ")", "[", "]", "'", " ":
-        device = device.replace(remove, "")  # to string, 'cuda:0' -> '0' and '(0, 1)' -> '0,1'
+    for remove in "musa:", "none", "(", ")", "[", "]", "'", " ":
+        device = device.replace(remove, "")  # to string, 'musa:0' -> '0' and '(0, 1)' -> '0,1'
     cpu = device == "cpu"
     mps = device in {"mps", "mps:0"}  # Apple Metal Performance Shaders (MPS)
     if cpu or mps:
-        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force torch.cuda.is_available() = False
+        os.environ["MUSA_VISIBLE_DEVICES"] = "-1"  # force torch.musa.is_available() = False
     elif device:  # non-cpu device requested
-        if device == "cuda":
+        if device == "musa":
             device = "0"
         visible = os.environ.get("CUDA_VISIBLE_DEVICES", None)
         os.environ["CUDA_VISIBLE_DEVICES"] = device  # set environment variable - must be before assert is_available()
-        if not (torch.cuda.is_available() and torch.cuda.device_count() >= len(device.split(","))):
+        if not (torch.musa.is_available() and torch.musa.device_count() >= len(device.split(","))):
             LOGGER.info(s)
             install = (
                 "See https://pytorch.org/get-started/locally/ for up-to-date torch install instructions if no "
-                "CUDA devices are seen by torch.\n"
-                if torch.cuda.device_count() == 0
+                "MUSA devices are seen by torch.\n"
+                if torch.musa.device_count() == 0
                 else ""
             )
             raise ValueError(
-                f"Invalid CUDA 'device={device}' requested."
-                f" Use 'device=cpu' or pass valid CUDA device(s) if available,"
+                f"Invalid MUSA 'device={device}' requested."
+                f" Use 'device=cpu' or pass valid MUSA device(s) if available,"
                 f" i.e. 'device=0' or 'device=0,1,2,3' for Multi-GPU.\n"
-                f"\ntorch.cuda.is_available(): {torch.cuda.is_available()}"
-                f"\ntorch.cuda.device_count(): {torch.cuda.device_count()}"
+                f"\ntorch.musa.is_available(): {torch.musa.is_available()}"
+                f"\ntorch.musa.device_count(): {torch.musa.device_count()}"
                 f"\nos.environ['CUDA_VISIBLE_DEVICES']: {visible}\n"
                 f"{install}"
             )
 
-    if not cpu and not mps and torch.cuda.is_available():  # prefer GPU if available
-        devices = device.split(",") if device else "0"  # range(torch.cuda.device_count())  # i.e. 0,1,6,7
+    if not cpu and not mps and torch.musa.is_available():  # prefer GPU if available
+        devices = device.split(",") if device else "0"  # range(torch.musa.device_count())  # i.e. 0,1,6,7
         n = len(devices)  # device count
         if n > 1:  # multi-GPU
             if batch < 1:
@@ -160,9 +160,9 @@ def select_device(device="", batch=0, newline=False, verbose=True):
                 )
         space = " " * (len(s) + 1)
         for i, d in enumerate(devices):
-            p = torch.cuda.get_device_properties(i)
-            s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n"  # bytes to MB
-        arg = "cuda:0"
+            p = torch.musa.get_device_properties(i)
+            s += f"{'' if i == 0 else space}MUSA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n"  # bytes to MB
+        arg = "musa:0"
     elif mps and TORCH_2_0 and torch.backends.mps.is_available():
         # Prefer MPS if available
         s += f"MPS ({get_cpu_info()})\n"
@@ -178,8 +178,8 @@ def select_device(device="", batch=0, newline=False, verbose=True):
 
 def time_sync():
     """PyTorch-accurate time."""
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
+    if torch.musa.is_available():
+        torch.musa.synchronize()
     return time.time()
 
 
@@ -440,8 +440,8 @@ def init_seeds(seed=0, deterministic=False):
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)  # for Multi-GPU, exception safe
+    torch.musa.manual_seed(seed)
+    torch.musa.manual_seed_all(seed)  # for Multi-GPU, exception safe
     # torch.backends.cudnn.benchmark = True  # AutoBatch problem https://github.com/ultralytics/yolov5/issues/9287
     if deterministic:
         if TORCH_2_0:
@@ -615,7 +615,7 @@ def profile(input, ops, n=10, device=None):
                        t[2] = float("nan")
                    tf += (t[1] - t[0]) * 1000 / n  # ms per op forward
                    tb += (t[2] - t[1]) * 1000 / n  # ms per op backward
-                mem = torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0  # (GB)
+                mem = torch.musa.memory_reserved() / 1e9 if torch.musa.is_available() else 0  # (GB)
                 s_in, s_out = (tuple(x.shape) if isinstance(x, torch.Tensor) else "list" for x in (x, y))  # shapes
                 p = sum(x.numel() for x in m.parameters()) if isinstance(m, nn.Module) else 0  # parameters
                 LOGGER.info(f"{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}")
@@ -624,7 +624,7 @@ def profile(input, ops, n=10, device=None):
                 LOGGER.info(e)
                 results.append(None)
             gc.collect()  # attempt to free unused memory
-            torch.cuda.empty_cache()
+            torch.musa.empty_cache()
     return results

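Note (not part of the diff above): a minimal usage sketch of the patched device selection, assuming Moore Threads' torch_musa extension is installed and that importing it registers the torch.musa namespace this patch relies on.

    # Hypothetical usage sketch; torch_musa is an assumed external dependency,
    # not something added by this patch.
    import torch
    import torch_musa  # noqa: F401  # registers the 'musa' device type and the torch.musa namespace

    from ultralytics.utils.torch_utils import select_device

    # With the patch applied, 'musa', 'musa:0', '0', or '0,1' select MUSA GPUs,
    # while 'cpu' hides all devices via MUSA_VISIBLE_DEVICES and falls back to CPU.
    device = select_device("musa:0")
    print(device)  # expected: device(type='musa', index=0) when a MUSA GPU is visible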
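Similarly, a hedged sketch of the mixed-precision pattern the trainer changes depend on, assuming torch.musa.amp mirrors the torch.cuda.amp GradScaler/autocast API (which is what the patched _setup_train and _do_train presume).

    # Sketch only: requires a MUSA device and the torch_musa extension.
    import torch
    import torch_musa  # noqa: F401  # assumed installed; exposes torch.musa.amp

    model = torch.nn.Linear(8, 1).to("musa")
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    scaler = torch.musa.amp.GradScaler(enabled=True)  # counterpart of torch.cuda.amp.GradScaler

    x = torch.randn(4, 8, device="musa")
    with torch.musa.amp.autocast(True):  # counterpart of torch.cuda.amp.autocast
        loss = model(x).sum()

    scaler.scale(loss).backward()  # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)         # unscale gradients, then step the optimizer
    scaler.update()                # adjust the scale factor for the next iteration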