Fix GPU detection for CUDA, ROCm, etc. using environment variables

Signed-off-by: Brian <[email protected]>
containers · Nov 25, 2024 · 69e8acb · 69e8acb
1 parent 590fe3e
commit 69e8acb
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 19 deletions.
diff --git a/ramalama/common.py b/ramalama/common.py
@@ -93,7 +93,7 @@ def run_cmd(args, cwd=None, stdout=subprocess.PIPE, ignore_stderr=False, debug=F
     if ignore_stderr:
         stderr = subprocess.PIPE
 
-    return subprocess.run(args, check=True, cwd=cwd, stdout=stdout, stderr=stderr)
+    return subprocess.run(args, check=True, cwd=cwd, stdout=stdout, stderr=stderr, text=True)
 
 
 def find_working_directory():

diff --git a/ramalama/model.py b/ramalama/model.py
@@ -98,12 +98,14 @@ def _image(self, args):
         if args.image != default_image():
             return args.image
 
-        gpu_type, _ = get_gpu()
-        if gpu_type == "HIP_VISIBLE_DEVICES":
+        if os.getenv("HIP_VISIBLE_DEVICES"):
             return "quay.io/ramalama/rocm:latest"
 
-        if gpu_type == "ASAHI_VISIBLE_DEVICES":
+        if os.getenv("ASAHI_VISIBLE_DEVICES"):
             return "quay.io/ramalama/asahi:latest"
+
+        if os.getenv("CUDA_VISIBLE_DEVICES"):
+            return "docker.io/brianmahabir/rama-cuda:v1"
 
         return args.image
 
@@ -143,9 +145,15 @@ def setup_container(self, args):
         if os.path.exists("/dev/kfd"):
             conman_args += ["--device", "/dev/kfd"]
 
-        gpu_type, gpu_num = get_gpu()
-        if gpu_type == "HIP_VISIBLE_DEVICES" or gpu_type == "ASAHI_VISIBLE_DEVICES":
-            conman_args += ["-e", f"{gpu_type}={gpu_num}"]
+        for var in ["HIP_VISIBLE_DEVICES", "ASAHI_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"]:
+            value = os.getenv(var)
+            if value:
+                if var == "CUDA_VISIBLE_DEVICES":
+                    # Special handling for CUDA (e.g., using '--gpus all')
+                    conman_args += ["--gpus", "all"]
+                else:
+                    # For HIP and ASAHI, we directly add the environment variable with its value
+                    conman_args += ["-e", f"{var}={value}"]
         return conman_args
 
     def run_container(self, args, shortnames):
@@ -190,14 +198,14 @@ def cleanup():
         return True
 
     def gpu_args(self):
+        gpu_type, gpu_num = get_gpu()
         gpu_args = []
         if sys.platform == "darwin":
             # llama.cpp will default to the Metal backend on macOS, so we don't need
             # any additional arguments.
             pass
-        elif sys.platform == "linux" and (
-            os.getenv("HIP_VISIBLE_DEVICES") or os.getenv("ASAHI_VISIBLE_DEVICES") or os.getenv("CUDA_VISIBLE_DEVICES")
-        ):
+        elif sys.platform == "linux" and gpu_type is not None:
+            os.environ[gpu_type] = gpu_num
             gpu_args = ["-ngl", "99"]
         else:
             print("GPU offload was requested but is not available on this system")
@@ -384,25 +392,52 @@ def check_valid_model_path(self, relative_target_path, model_path):
 
 def get_gpu():
     i = 0
-    gpu_num = 0
-    gpu_bytes = 0
+    amd_gpu_num = -1
+    amd_gpu_bytes = 0
+    nvidia_gpu_num = -1
+    nvidia_gpu_mib = 0
+
+    # Check for AMD GPUs (ROCm/AMD case)
     for fp in sorted(glob.glob('/sys/bus/pci/devices/*/mem_info_vram_total')):
         with open(fp, 'r') as file:
             content = int(file.read())
-            if content > 1073741824 and content > gpu_bytes:
-                gpu_bytes = content
-                gpu_num = i
+            if content > 1073741824 and content > amd_gpu_bytes:
+                amd_gpu_bytes = content
+                amd_gpu_num = i
 
         i += 1
 
-    if gpu_bytes:  # this is the ROCm/AMD case
-        return "HIP_VISIBLE_DEVICES", gpu_num
-
+    # Check if system is running Asahi Linux (Apple Silicon)
     if os.path.exists('/etc/os-release'):
         with open('/etc/os-release', 'r') as file:
             content = file.read()
             if "asahi" in content.lower():
-                return "ASAHI_VISIBLE_DEVICES", 1
+                return "ASAHI_VISIBLE_DEVICES", 1  # For Apple Silicon with Asahi Linux
+
+    # Check for NVIDIA GPUs (CUDA case)
+    try:
+        command = ['nvidia-smi', '--query-gpu=index,memory.total', '--format=csv,noheader,nounits']
+        output = run_cmd(command)
+        gpus = output.stdout.strip().split('\n')
+        gpus_sorted = sorted(gpus, key=lambda x: int(x.split(',')[1]), reverse=True)
+        nvidia_gpu_mib = int(gpus_sorted[0].split(',')[1])
+        nvidia_gpu_num = gpus_sorted[0].split(',')[0]
+
+        # Compare AMD and NVIDIA GPUs
+        if amd_gpu_bytes and nvidia_gpu_mib:
+            # bytes to MiB
+            amd_gpu_mib = amd_gpu_bytes / 1048576
+            if amd_gpu_mib > nvidia_gpu_mib:
+                return "HIP_VISIBLE_DEVICES", amd_gpu_num
+            else:
+                return "CUDA_VISIBLE_DEVICES", nvidia_gpu_num
+        elif amd_gpu_bytes:
+            return "HIP_VISIBLE_DEVICES", amd_gpu_num
+        elif nvidia_gpu_mib:
+            return "CUDA_VISIBLE_DEVICES", nvidia_gpu_num
+
+    except Exception:
+        pass  # If no NVIDIA GPU is found or there's an error
 
     return None, None