Fixed GPU detection for CUDA, ROCm, etc. using env vars
Signed-off-by: Brian <[email protected]>
bmahabirbu committed Nov 25, 2024
1 parent 590fe3e commit a7a8309
Showing 2 changed files with 54 additions and 19 deletions.
ramalama/common.py (2 changes: 1 addition & 1 deletion)
@@ -93,7 +93,7 @@ def run_cmd(args, cwd=None, stdout=subprocess.PIPE, ignore_stderr=False, debug=F
     if ignore_stderr:
         stderr = subprocess.PIPE
 
-    return subprocess.run(args, check=True, cwd=cwd, stdout=stdout, stderr=stderr)
+    return subprocess.run(args, check=True, cwd=cwd, stdout=stdout, stderr=stderr, text=True)
 
 
 def find_working_directory():
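Note: the `text=True` change makes `run_cmd` return decoded `str` output rather than `bytes`, which the new `nvidia-smi` parsing in `get_gpu()` below depends on. A minimal standalone sketch of the difference (illustrative, not RamaLama code; assumes a POSIX `echo`):

import subprocess

# With text=True, stdout is decoded to str, so .strip()/.split() work directly.
result = subprocess.run(
    ["echo", "0, 24576"],  # stand-in for one line of nvidia-smi CSV output
    check=True,
    stdout=subprocess.PIPE,
    text=True,
)
print(result.stdout.strip().split(","))  # ['0', ' 24576']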
ramalama/model.py (71 changes: 53 additions & 18 deletions)
@@ -98,12 +98,14 @@ def _image(self, args):
         if args.image != default_image():
             return args.image
 
-        gpu_type, _ = get_gpu()
-        if gpu_type == "HIP_VISIBLE_DEVICES":
+        if os.getenv("HIP_VISIBLE_DEVICES"):
             return "quay.io/ramalama/rocm:latest"
 
-        if gpu_type == "ASAHI_VISIBLE_DEVICES":
+        if os.getenv("ASAHI_VISIBLE_DEVICES"):
             return "quay.io/ramalama/asahi:latest"
 
+        if os.getenv("CUDA_VISIBLE_DEVICES"):
+            return "docker.io/brianmahabir/rama-cuda:v1"
+
         return args.image

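Note: image selection now keys off environment variables alone, checked in HIP, Asahi, CUDA order. A hedged standalone sketch of the same priority (the `pick_image` helper and its default image name are illustrative, not RamaLama's API):

import os

def pick_image(default="quay.io/ramalama/ramalama:latest"):  # default name is an assumption
    if os.getenv("HIP_VISIBLE_DEVICES"):
        return "quay.io/ramalama/rocm:latest"
    if os.getenv("ASAHI_VISIBLE_DEVICES"):
        return "quay.io/ramalama/asahi:latest"
    if os.getenv("CUDA_VISIBLE_DEVICES"):
        return "docker.io/brianmahabir/rama-cuda:v1"
    return default

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(pick_image())  # docker.io/brianmahabir/rama-cuda:v1

Because the checks run in order, HIP_VISIBLE_DEVICES takes precedence over CUDA_VISIBLE_DEVICES when both are set.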
@@ -143,9 +145,15 @@ def setup_container(self, args):
         if os.path.exists("/dev/kfd"):
             conman_args += ["--device", "/dev/kfd"]
 
-        gpu_type, gpu_num = get_gpu()
-        if gpu_type == "HIP_VISIBLE_DEVICES" or gpu_type == "ASAHI_VISIBLE_DEVICES":
-            conman_args += ["-e", f"{gpu_type}={gpu_num}"]
+        for var in ["HIP_VISIBLE_DEVICES", "ASAHI_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"]:
+            value = os.getenv(var)
+            if value:
+                if var == "CUDA_VISIBLE_DEVICES":
+                    # Special handling for CUDA (e.g., using '--gpus all')
+                    conman_args += ["--gpus", "all"]
+                else:
+                    # For HIP and ASAHI, we directly add the environment variable with its value
+                    conman_args += ["-e", f"{var}={value}"]
         return conman_args
 
     def run_container(self, args, shortnames):
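Note: with this loop, one environment variable is enough to put GPU flags on the container command line: CUDA gets `--gpus all`, while HIP and Asahi pass the variable through with `-e`. A simplified standalone sketch of the resulting arguments (assumed behavior, extracted from the method above):

import os

def gpu_container_flags():
    # Same decision as the loop above, collected into a fresh list.
    flags = []
    for var in ["HIP_VISIBLE_DEVICES", "ASAHI_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"]:
        value = os.getenv(var)
        if value:
            if var == "CUDA_VISIBLE_DEVICES":
                flags += ["--gpus", "all"]
            else:
                flags += ["-e", f"{var}={value}"]
    return flags

os.environ["HIP_VISIBLE_DEVICES"] = "0"
print(gpu_container_flags())  # ['-e', 'HIP_VISIBLE_DEVICES=0']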
Expand Down Expand Up @@ -190,14 +198,14 @@ def cleanup():
         return True
 
     def gpu_args(self):
+        gpu_type, gpu_num = get_gpu()
         gpu_args = []
         if sys.platform == "darwin":
             # llama.cpp will default to the Metal backend on macOS, so we don't need
             # any additional arguments.
             pass
-        elif sys.platform == "linux" and (
-            os.getenv("HIP_VISIBLE_DEVICES") or os.getenv("ASAHI_VISIBLE_DEVICES") or os.getenv("CUDA_VISIBLE_DEVICES")
-        ):
+        elif sys.platform == "linux" and gpu_type != None:
+            os.environ[gpu_type] = gpu_num
             gpu_args = ["-ngl", "99"]
         else:
             print("GPU offload was requested but is not available on this system")
@@ -384,25 +392,52 @@ def check_valid_model_path(self, relative_target_path, model_path):

 def get_gpu():
     i = 0
-    gpu_num = 0
-    gpu_bytes = 0
+    amd_gpu_num = -1
+    amd_gpu_bytes = 0
+    nvidia_gpu_num = -1
+    nvidia_gpu_mib = 0
 
+    # Check for AMD GPUs (ROCm/AMD case)
     for fp in sorted(glob.glob('/sys/bus/pci/devices/*/mem_info_vram_total')):
         with open(fp, 'r') as file:
             content = int(file.read())
-            if content > 1073741824 and content > gpu_bytes:
-                gpu_bytes = content
-                gpu_num = i
+            if content > 1073741824 and content > amd_gpu_bytes:
+                amd_gpu_bytes = content
+                amd_gpu_num = i
 
         i += 1
 
-    if gpu_bytes:  # this is the ROCm/AMD case
-        return "HIP_VISIBLE_DEVICES", gpu_num
-
+    # Check if system is running Asahi Linux (Apple Silicon)
     if os.path.exists('/etc/os-release'):
         with open('/etc/os-release', 'r') as file:
             content = file.read()
             if "asahi" in content.lower():
-                return "ASAHI_VISIBLE_DEVICES", 1
+                return "ASAHI_VISIBLE_DEVICES", 1  # For Apple Silicon with Asahi Linux
 
+    # Check for NVIDIA GPUs (CUDA case)
+    try:
+        command = ['nvidia-smi', '--query-gpu=index,memory.total', '--format=csv,noheader,nounits']
+        output = run_cmd(command)
+        gpus = output.stdout.strip().split('\n')
+        gpus_sorted = sorted(gpus, key=lambda x: int(x.split(',')[1]), reverse=True)
+        nvidia_gpu_mib = int(gpus_sorted[0].split(',')[1])
+        nvidia_gpu_num = gpus_sorted[0].split(',')[0]
+
+        # Compare AMD and NVIDIA GPUs
+        if amd_gpu_bytes and nvidia_gpu_mib:
+            # bytes to MiB
+            amd_gpu_mib = amd_gpu_bytes / 1048576
+            if amd_gpu_mib > nvidia_gpu_mib:
+                return "HIP_VISIBLE_DEVICES", amd_gpu_num
+            else:
+                return "CUDA_VISIBLE_DEVICES", nvidia_gpu_num
+        elif amd_gpu_bytes:
+            return "HIP_VISIBLE_DEVICES", amd_gpu_num
+        elif nvidia_gpu_mib:
+            return "CUDA_VISIBLE_DEVICES", nvidia_gpu_num
+
+    except Exception:
+        pass  # If no NVIDIA GPU is found or there's an error
+
     return None, None
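Note: the `nvidia-smi` query above emits one `index, memory.total` CSV line per GPU; the code sorts those lines by memory and picks the largest device. A hedged sketch of just that parsing step, fed canned output (the sample values are invented):

# Sample shape of:
#   nvidia-smi --query-gpu=index,memory.total --format=csv,noheader,nounits
stdout = "0, 24576\n1, 12288"  # invented: GPU 0 with 24 GiB, GPU 1 with 12 GiB

gpus = stdout.strip().split("\n")
gpus_sorted = sorted(gpus, key=lambda x: int(x.split(",")[1]), reverse=True)
print(gpus_sorted[0].split(",")[0])       # index of the largest GPU: '0'
print(int(gpus_sorted[0].split(",")[1]))  # its memory in MiB: 24576

As in the diff, the index comes back as a string; `get_gpu()` falls back to `(None, None)` when no AMD, Asahi, or NVIDIA device is detected.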
