From c9c777d959b829303e7ca55e9a195d4d27cb35d9 Mon Sep 17 00:00:00 2001
From: Alexey Korepanov
Date: Sat, 1 Feb 2025 12:47:09 +0000
Subject: [PATCH] add --keep-groups and --ngl options

Signed-off-by: Alexey Korepanov
---
 docs/ramalama.1.md      |  7 +++++++
 docs/ramalama.conf      |  9 +++++++++
 docs/ramalama.conf.5.md |  9 +++++++++
 ramalama/cli.py         | 15 +++++++++++++++
 ramalama/model.py       | 16 +++++++++-------
 5 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/docs/ramalama.1.md b/docs/ramalama.1.md
index 425f2b3a..0fdf47dc 100644
--- a/docs/ramalama.1.md
+++ b/docs/ramalama.1.md
@@ -109,6 +109,13 @@ The default can be overridden in the ramalama.conf file or via the
 the RAMALAMA_IMAGE environment variable.
 `export RAMALAMA_TRANSPORT=quay.io/ramalama/aiimage:latest` tells RamaLama to use the `quay.io/ramalama/aiimage:latest` image.
 
+#### **--keep-groups**
+pass `--group-add keep-groups` to podman (default: False)
+Needed to access the GPU on some systems, but it has security implications; use with caution.
+
+#### **--ngl**
+number of layers to offload to the GPU (default: 999)
+
 #### **--nocontainer**
 do not run RamaLama in the default container (default: False)
 The default can be overridden in the ramalama.conf file.
diff --git a/docs/ramalama.conf b/docs/ramalama.conf
index 1d1e2a4b..e8c18d73 100644
--- a/docs/ramalama.conf
+++ b/docs/ramalama.conf
@@ -44,6 +44,15 @@
 #
 #host = "0.0.0.0"
 
+# Pass `--group-add keep-groups` to podman when podman is the container engine.
+# In some cases this is needed to access the GPU from a rootless container
+#
+#keep_groups = false
+
+# Default number of layers to offload to the GPU
+#
+#ngl = 999
+
 # Specify default port for services to listen on
 #
 #port = "8080"
diff --git a/docs/ramalama.conf.5.md b/docs/ramalama.conf.5.md
index 747f93cb..9924c599 100644
--- a/docs/ramalama.conf.5.md
+++ b/docs/ramalama.conf.5.md
@@ -86,6 +86,15 @@ IP address for llama.cpp to listen on.
 OCI container image to run with the specified AI model
 RAMALAMA_IMAGE environment variable overrides this field.
 
+**keep_groups**=false
+
+Pass `--group-add keep-groups` to podman when podman is the container engine.
+In some cases this is needed to access the GPU from a rootless container.
+
+**ngl**=999
+
+Default number of layers to offload to the GPU.
+
 **port**="8080"
 
 Specify default port for services to listen on
diff --git a/ramalama/cli.py b/ramalama/cli.py
index 0b3577b4..2ff2f911 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -189,6 +189,21 @@ def configure_arguments(parser):
         action="store_true",
         help="offload the workload to the GPU",
     )
+    parser.add_argument(
+        "--ngl",
+        dest="ngl",
+        type=int,
+        default=config.get("ngl", 999),
+        help="number of layers to offload to the GPU, if available",
+    )
+    parser.add_argument(
+        "--keep-groups",
+        dest="podman_keep_groups",
+        default=config.get("keep_groups", False),
+        action="store_true",
+        help="""pass `--group-add keep-groups` to podman, if using podman.
+Needed to access the GPU on some systems, but has security implications.""",
+    )
     parser.add_argument(
         "--image",
         default=config.get("image"),
diff --git a/ramalama/model.py b/ramalama/model.py
index 710a27ac..e114806f 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -160,6 +160,8 @@ def setup_container(self, args):
 
         if os.path.basename(args.engine) == "podman":
             conman_args += ["--pull=newer"]
+            if args.podman_keep_groups:
+                conman_args += ["--group-add", "keep-groups"]
         elif os.path.basename(args.engine) == "docker":
             try:
                 run_cmd([args.engine, "pull", "-q", args.image], ignore_all=True)
@@ -188,10 +190,10 @@ def setup_container(self, args):
                 conman_args += ["-e", f"{k}={v}"]
         return conman_args
 
-    def gpu_args(self, force=False, runner=False):
+    def gpu_args(self, args, runner=False):
         gpu_args = []
         if (
-            force
+            args.gpu
             or os.getenv("HIP_VISIBLE_DEVICES")
             or os.getenv("ASAHI_VISIBLE_DEVICES")
             or os.getenv("CUDA_VISIBLE_DEVICES")
@@ -206,7 +208,7 @@ def gpu_args(self, force=False, runner=False):
             else:
                 gpu_args += ["-ngl"]  # single dash
 
-            gpu_args += ["999"]
+            gpu_args += [f"{args.ngl}"]
 
         return gpu_args
 
@@ -256,7 +258,7 @@ def build_exec_args_perplexity(self, args, model_path):
         exec_args = ["llama-perplexity"]
 
         get_gpu()
-        gpu_args = self.gpu_args(force=args.gpu)
+        gpu_args = self.gpu_args(args=args)
         if gpu_args is not None:
             exec_args.extend(gpu_args)
 
@@ -295,7 +297,7 @@ def build_exec_args_bench(self, args, model_path):
         exec_args = ["llama-bench"]
 
         get_gpu()
-        gpu_args = self.gpu_args(force=args.gpu)
+        gpu_args = self.gpu_args(args=args)
         if gpu_args is not None:
             exec_args.extend(gpu_args)
 
@@ -314,7 +316,7 @@ def build_exec_args_run(self, args, model_path, prompt):
             exec_args += ["-v"]
 
         get_gpu()
-        gpu_args = self.gpu_args(force=args.gpu, runner=True)
+        gpu_args = self.gpu_args(args=args, runner=True)
         if gpu_args is not None:
             exec_args.extend(gpu_args)
 
@@ -379,7 +381,7 @@ def handle_runtime(self, args, exec_args, exec_model_path):
             exec_args = ["--port", args.port, "--model", MNT_FILE, "--max_model_len", "2048"]
         else:
             get_gpu()
-            gpu_args = self.gpu_args(force=args.gpu)
+            gpu_args = self.gpu_args(args=args)
             if gpu_args is not None:
                 exec_args.extend(gpu_args)
             exec_args.extend(["--host", args.host])
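
Reviewer note, not part of the patch: below is a minimal, self-contained sketch of how the two new options are intended to flow from the command line into the podman and llama.cpp invocations. The `build_commands` helper, the SimpleNamespace stand-in for the argparse namespace, and the reduced set of environment-variable checks are illustrative assumptions, not RamaLama code.

    # Hedged sketch only: args.ngl replaces the hard-coded "999" and
    # args.podman_keep_groups gates `--group-add keep-groups`, mirroring the
    # intent of this patch without RamaLama's real argparse/config wiring.
    import os
    from types import SimpleNamespace


    def build_commands(args):
        conman_args = ["podman", "run", "--rm"]
        if args.podman_keep_groups:
            # Keeps the caller's supplementary groups inside a rootless
            # container, which some systems need to reach GPU device nodes;
            # it also widens group membership, hence the security caveat.
            conman_args += ["--group-add", "keep-groups"]

        exec_args = ["llama-run"]
        if args.gpu or os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv("HIP_VISIBLE_DEVICES"):
            exec_args += ["-ngl", f"{args.ngl}"]  # layers offloaded to the GPU
        return conman_args, exec_args


    # Roughly what `ramalama --gpu --keep-groups --ngl 40 run <model>` should build.
    args = SimpleNamespace(podman_keep_groups=True, gpu=True, ngl=40)
    print(build_commands(args))

One design note on the patch itself: switching gpu_args(force=...) to gpu_args(args, ...) means every call site now passes the whole namespace, so later per-invocation GPU tuning (as --ngl does here) should not require another signature change.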