From 81aebeeaf64a0cb490a6d54e35bd63bc233b2f92 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Wed, 15 Nov 2023 15:02:04 -0500 Subject: [PATCH 1/7] Add CUDA_VISIBLE_DEVICES environment variable when using the --gpus flag --- mlcube/mlcube/parser.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mlcube/mlcube/parser.py b/mlcube/mlcube/parser.py index 17ee4269..0ef794db 100644 --- a/mlcube/mlcube/parser.py +++ b/mlcube/mlcube/parser.py @@ -128,6 +128,9 @@ def parse_extra_arg( if parsed_args.get("gpus", None): if platform == "docker": runner_run_args["--gpus"] = parsed_args["gpus"] + os.environ["CUDA_VISIBLE_DEVICES"] = parsed_args[ + "gpus" + ] else: runner_run_args["--nv"] = "" os.environ["SINGULARITYENV_CUDA_VISIBLE_DEVICES"] = parsed_args[ From 7a5fea8d5a9c23bc97c8eeb08b992b66f9530289 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Tue, 28 Nov 2023 16:03:12 -0500 Subject: [PATCH 2/7] Fix logic when defining environment variable --- mlcube/mlcube/parser.py | 3 --- runners/mlcube_docker/mlcube_docker/docker_run.py | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlcube/mlcube/parser.py b/mlcube/mlcube/parser.py index 0ef794db..17ee4269 100644 --- a/mlcube/mlcube/parser.py +++ b/mlcube/mlcube/parser.py @@ -128,9 +128,6 @@ def parse_extra_arg( if parsed_args.get("gpus", None): if platform == "docker": runner_run_args["--gpus"] = parsed_args["gpus"] - os.environ["CUDA_VISIBLE_DEVICES"] = parsed_args[ - "gpus" - ] else: runner_run_args["--nv"] = "" os.environ["SINGULARITYENV_CUDA_VISIBLE_DEVICES"] = parsed_args[ diff --git a/runners/mlcube_docker/mlcube_docker/docker_run.py b/runners/mlcube_docker/mlcube_docker/docker_run.py index a8f918fe..d2c5870f 100644 --- a/runners/mlcube_docker/mlcube_docker/docker_run.py +++ b/runners/mlcube_docker/mlcube_docker/docker_run.py @@ -266,6 +266,9 @@ def run(self) -> None: if extra_args: run_args += " " + extra_args + cuda_visible_devices = self.mlcube.runner["--gpus"] if "--gpus" in self.mlcube.runner else 
num_gpus + run_args += f" --env CUDA_VISIBLE_DEVICES={cuda_visible_devices}" + if "entrypoint" in self.mlcube.tasks[self.task]: logger.info( "Using custom task entrypoint: task=%s, entrypoint='%s'", From da9d74a5fa3dc370ba47312b5ea8d4caaed8b76f Mon Sep 17 00:00:00 2001 From: David Jurado Date: Thu, 30 Nov 2023 13:51:14 -0500 Subject: [PATCH 3/7] Fix validation when using --gpus flag --- runners/mlcube_docker/mlcube_docker/docker_run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runners/mlcube_docker/mlcube_docker/docker_run.py b/runners/mlcube_docker/mlcube_docker/docker_run.py index d2c5870f..3b96ea3b 100644 --- a/runners/mlcube_docker/mlcube_docker/docker_run.py +++ b/runners/mlcube_docker/mlcube_docker/docker_run.py @@ -266,7 +266,8 @@ def run(self) -> None: if extra_args: run_args += " " + extra_args - cuda_visible_devices = self.mlcube.runner["--gpus"] if "--gpus" in self.mlcube.runner else num_gpus + valid_gpu_flag = "--gpus" in self.mlcube.runner and self.mlcube.runner["--gpus"] is not None + cuda_visible_devices = self.mlcube.runner["--gpus"] if valid_gpu_flag else num_gpus run_args += f" --env CUDA_VISIBLE_DEVICES={cuda_visible_devices}" if "entrypoint" in self.mlcube.tasks[self.task]: From b2aea7665aa5df8c1d10908a42f7809d37d19e0d Mon Sep 17 00:00:00 2001 From: David Jurado Date: Fri, 8 Dec 2023 10:19:46 -0500 Subject: [PATCH 4/7] Fix gpu parsing logic --- .../mlcube_docker/mlcube_docker/docker_run.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/runners/mlcube_docker/mlcube_docker/docker_run.py b/runners/mlcube_docker/mlcube_docker/docker_run.py index 3b96ea3b..2c84140c 100644 --- a/runners/mlcube_docker/mlcube_docker/docker_run.py +++ b/runners/mlcube_docker/mlcube_docker/docker_run.py @@ -267,7 +267,21 @@ def run(self) -> None: run_args += " " + extra_args valid_gpu_flag = "--gpus" in self.mlcube.runner and self.mlcube.runner["--gpus"] is not None - cuda_visible_devices = 
self.mlcube.runner["--gpus"] if valid_gpu_flag else num_gpus + + + if valid_gpu_flag: + cuda_visible_devices = self.mlcube.runner["--gpus"] + if "device" in cuda_visible_devices: + cuda_visible_devices = cuda_visible_devices.replace("device=", "") + else: + cuda_visible_devices = num_gpus + if num_gpus == 0: + cuda_visible_devices = "" + + if cuda_visible_devices.isnumeric(): + cuda_visible_devices = str(list(range(cuda_visible_devices))) + cuda_visible_devices = cuda_visible_devices.replace(" ", "").replace("[","").replace("]","") + run_args += f" --env CUDA_VISIBLE_DEVICES={cuda_visible_devices}" if "entrypoint" in self.mlcube.tasks[self.task]: From bd47fd069d31f944b7e1c8c1bab58d406f46ec05 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Fri, 8 Dec 2023 16:07:09 -0500 Subject: [PATCH 5/7] Fix gpu parsing logic when numeric --- runners/mlcube_docker/mlcube_docker/docker_run.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/runners/mlcube_docker/mlcube_docker/docker_run.py b/runners/mlcube_docker/mlcube_docker/docker_run.py index 2c84140c..b3b021f0 100644 --- a/runners/mlcube_docker/mlcube_docker/docker_run.py +++ b/runners/mlcube_docker/mlcube_docker/docker_run.py @@ -275,12 +275,10 @@ def run(self) -> None: cuda_visible_devices = cuda_visible_devices.replace("device=", "") else: cuda_visible_devices = num_gpus - if num_gpus == 0: - cuda_visible_devices = "" if cuda_visible_devices.isnumeric(): cuda_visible_devices = str(list(range(cuda_visible_devices))) - cuda_visible_devices = cuda_visible_devices.replace(" ", "").replace("[","").replace("]","") + cuda_visible_devices = cuda_visible_devices.replace(" ", "")[1:-1] run_args += f" --env CUDA_VISIBLE_DEVICES={cuda_visible_devices}" From b6bc2df5e0c297e127f355c00be048c1ccff0d68 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Tue, 12 Dec 2023 10:09:15 -0500 Subject: [PATCH 6/7] Fix bug when using int value --- runners/mlcube_docker/mlcube_docker/docker_run.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/runners/mlcube_docker/mlcube_docker/docker_run.py b/runners/mlcube_docker/mlcube_docker/docker_run.py index b3b021f0..c6df84dd 100644 --- a/runners/mlcube_docker/mlcube_docker/docker_run.py +++ b/runners/mlcube_docker/mlcube_docker/docker_run.py @@ -276,8 +276,8 @@ def run(self) -> None: else: cuda_visible_devices = num_gpus - if cuda_visible_devices.isnumeric(): - cuda_visible_devices = str(list(range(cuda_visible_devices))) + if str(cuda_visible_devices).isnumeric(): + cuda_visible_devices = str(list(range(int(cuda_visible_devices)))) cuda_visible_devices = cuda_visible_devices.replace(" ", "")[1:-1] run_args += f" --env CUDA_VISIBLE_DEVICES={cuda_visible_devices}" From c79b8969f84f3e49b1251a3356db03cd9cf4e3fb Mon Sep 17 00:00:00 2001 From: David Jurado Date: Tue, 19 Dec 2023 08:52:11 -0500 Subject: [PATCH 7/7] Fix parser logic --- mlcube/mlcube/parser.py | 12 ++++++++---- runners/mlcube_docker/mlcube_docker/docker_run.py | 3 --- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/mlcube/mlcube/parser.py b/mlcube/mlcube/parser.py index 17ee4269..3097bf50 100644 --- a/mlcube/mlcube/parser.py +++ b/mlcube/mlcube/parser.py @@ -126,13 +126,17 @@ def parse_extra_arg( key = "--security-opt" if platform == "docker" else "--security" runner_run_args[key] = parsed_args["security"] if parsed_args.get("gpus", None): + cuda_visible_devices = parsed_args["gpus"] + if "device" in cuda_visible_devices: + cuda_visible_devices = cuda_visible_devices.replace("device=", "") + elif str(cuda_visible_devices).isnumeric(): + cuda_visible_devices = str(list(range(int(cuda_visible_devices)))) + cuda_visible_devices = cuda_visible_devices.replace(" ", "")[1:-1] if platform == "docker": - runner_run_args["--gpus"] = parsed_args["gpus"] + runner_run_args["--gpus"] = cuda_visible_devices else: runner_run_args["--nv"] = "" - os.environ["SINGULARITYENV_CUDA_VISIBLE_DEVICES"] = parsed_args[ - "gpus" - ] + 
os.environ["SINGULARITYENV_CUDA_VISIBLE_DEVICES"] = cuda_visible_devices if parsed_args.get("memory", None): key = "--memory" if platform == "docker" else "--vm-ram" runner_run_args[key] = parsed_args["memory"] diff --git a/runners/mlcube_docker/mlcube_docker/docker_run.py b/runners/mlcube_docker/mlcube_docker/docker_run.py index c6df84dd..4829bc65 100644 --- a/runners/mlcube_docker/mlcube_docker/docker_run.py +++ b/runners/mlcube_docker/mlcube_docker/docker_run.py @@ -268,11 +268,8 @@ def run(self) -> None: valid_gpu_flag = "--gpus" in self.mlcube.runner and self.mlcube.runner["--gpus"] is not None - if valid_gpu_flag: cuda_visible_devices = self.mlcube.runner["--gpus"] - if "device" in cuda_visible_devices: - cuda_visible_devices = cuda_visible_devices.replace("device=", "") else: cuda_visible_devices = num_gpus