Enable containers on macOS to use the GPU #397

Merged 3 commits on Oct 31, 2024
2 changes: 1 addition & 1 deletion container-images/ramalama/Containerfile
@@ -6,7 +6,7 @@ ARG HUGGINGFACE_HUB_VERSION=0.26.2
ARG OMLMD_VERSION=0.1.6
# renovate: datasource=github-releases depName=tqdm/tqdm extractVersion=^v(?<version>.*)
ARG TQDM_VERSION=4.66.6
-ARG LLAMA_CPP_SHA=3f1ae2e32cde00c39b96be6d01c2997c29bae555
+ARG LLAMA_CPP_SHA=1329c0a75e6a7defc5c380eaf80d8e0f66d7da78
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=19dca2bb1464326587cbeb7af00f93c4a59b01fd

3 changes: 3 additions & 0 deletions docs/ramalama.1.md
@@ -89,6 +89,9 @@ show container runtime command without executing it (default: False)
run RamaLama using the specified container engine. Default is `podman` if installed otherwise docker.
The default can be overridden in the ramalama.conf file or via the RAMALAMA_CONTAINER_ENGINE environment variable.

#### **--gpu**
offload the workload to the GPU (default: False)

#### **--help**, **-h**
show this help message and exit

7 changes: 7 additions & 0 deletions ramalama/cli.py
@@ -196,6 +196,13 @@ def configure_arguments(parser):
        help="""do not run RamaLama in the default container.
The RAMALAMA_IN_CONTAINER environment variable modifies default behaviour.""",
    )
    parser.add_argument(
        "--gpu",
        dest="gpu",
        default=False,
        action="store_true",
        help="offload the workload to the GPU",
    )
    parser.add_argument(
        "--runtime",
        default=config.get("runtime"),
26 changes: 23 additions & 3 deletions ramalama/model.py
@@ -37,8 +37,6 @@ class Model:

    def __init__(self, model):
        self.model = model
-        if sys.platform == "darwin" or os.getenv("HIP_VISIBLE_DEVICES") or os.getenv("CUDA_VISIBLE_DEVICES"):
-            self.common_params += ["-ngl", "99"]

    def login(self, args):
        raise NotImplementedError(f"ramalama login for {self.type} not implemented")
@@ -146,7 +144,7 @@ def run_container(self, args, shortnames):
        if hasattr(args, "port"):
            conman_args += ["-p", f"{args.port}:{args.port}"]

-        if os.path.exists("/dev/dri"):
+        if sys.platform == "darwin" or os.path.exists("/dev/dri"):
            conman_args += ["--device", "/dev/dri"]

        if os.path.exists("/dev/kfd"):
@@ -180,6 +178,20 @@ def cleanup():
        run_cmd(conman_args, stdout=None, debug=args.debug)
        return True

    def gpu_args(self):
        gpu_args = []
        if sys.platform == "darwin":
            # llama.cpp will default to the Metal backend on macOS, so we don't need
            # any additional arguments.
            pass
        elif sys.platform == "linux" and (os.path.exists("/dev/dri") or
                                          os.getenv("HIP_VISIBLE_DEVICES") or os.getenv("CUDA_VISIBLE_DEVICES")):
            gpu_args = ["-ngl", "99"]
        else:
            print("GPU offload was requested but is not available on this system")
Review comment (Member):
Should this be raised as an exception?

Reply (Collaborator, author):
I don't think there's a "right" answer to that question; it's mostly a UX decision. Stopping the application might be less confusing, since users can't miss the message. On the other hand, llama.cpp's behavior is to ignore -ngl 99 if there is any problem offloading the model to the GPU, so there would still be plenty of cases where the fallback happens silently anyway.

        return gpu_args
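
For reference, a minimal sketch of the exception-raising alternative discussed in the thread above. It is hypothetical and not part of this PR: the exception name and message are made up, while the detection logic mirrors the merged gpu_args() above.

```python
import os
import sys


class GPUNotAvailableError(RuntimeError):
    """Hypothetical exception type; not part of the PR."""


def gpu_args(self):
    """Alternative Model.gpu_args() that raises instead of printing a warning."""
    if sys.platform == "darwin":
        # llama.cpp defaults to the Metal backend on macOS; no extra arguments needed.
        return []
    if sys.platform == "linux" and (
        os.path.exists("/dev/dri")
        or os.getenv("HIP_VISIBLE_DEVICES")
        or os.getenv("CUDA_VISIBLE_DEVICES")
    ):
        return ["-ngl", "99"]
    # Fail loudly so the user cannot miss the message.
    raise GPUNotAvailableError("GPU offload was requested but is not available on this system")
```

Even then, as the reply notes, llama.cpp itself ignores -ngl 99 when offloading fails, so raising here would not cover every silent CPU fallback; the merged change keeps the warning-only behavior shown above.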

    def run(self, args):
        prompt = "You are a helpful assistant"
        if args.ARGS:
@@ -205,6 +217,9 @@ def run(self, args):
        if not args.ARGS and sys.stdin.isatty():
            exec_args.append("-cnv")

        if args.gpu:
            exec_args.extend(self.gpu_args())

        try:
            exec_cmd(exec_args, args.debug, debug=args.debug)
        except FileNotFoundError as e:
@@ -217,6 +232,11 @@ def serve(self, args):
        exec_args = ["llama-server", "--port", args.port, "-m", model_path]
        if args.runtime == "vllm":
            exec_args = ["vllm", "serve", "--port", args.port, model_path]
        else:
            if args.gpu:
                exec_args.extend(self.gpu_args())
            if in_container():
                exec_args.extend(["--host", "0.0.0.0"])

        if args.generate == "quadlet":
            return self.quadlet(model_path, args, exec_args)
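
To make the serve() changes above concrete, here is a rough sketch (not output from this PR) of what exec_args would contain for the default llama.cpp runtime when --gpu is set on a Linux host with a usable GPU and the server runs inside a container; the port and model path are placeholders.

```python
# Hypothetical illustration only; port and model path are placeholders.
exec_args = [
    "llama-server",
    "--port", "8080",             # placeholder port
    "-m", "/path/to/model.gguf",  # placeholder model path
    "-ngl", "99",                 # appended by Model.gpu_args() because --gpu was set
    "--host", "0.0.0.0",          # appended because in_container() is true
]
```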