Add --ngl to specify the number of GPU layers, and --keep-groups so Podman has access to the GPU #659

Merged: 1 commit, Feb 2, 2025
7 changes: 7 additions & 0 deletions docs/ramalama.1.md
@@ -109,6 +109,13 @@ The default can be overridden in the ramalama.conf file or via the
RAMALAMA_IMAGE environment variable. `export RAMALAMA_IMAGE=quay.io/ramalama/aiimage:latest` tells
RamaLama to use the `quay.io/ramalama/aiimage:latest` image.

#### **--keep-groups**
**Member:**
I would prefer to have RamaLama figure this out, or always set it when Podman is used in rootless mode.
Users are not going to understand when and when not to use this.

**Collaborator Author:**
I think this should be a decision of maintainers...

I think that "keep-groups" should only be enabled when necessary, and I can't think of bulletproof logic to determine when it is necessary. So keeping it as an option, off by default, seemed sensible.

**Member:**

The downside of using it all of the time is leaking GIDs into the container when not necessary. Add an option to allow users to disable it for security purposes, but I think we should enable it by default for rootless containers so that users will not stumble over it.

pass --group-add keep-groups to podman (default: False)
Needed to access the gpu on some systems, but has an impact on security, use with caution.
Comment on lines +112 to +114
**Contributor:**

🚨 suggestion (security): Elaborate on the security implications of --keep-groups.

What are the specific security implications of using this flag?

Suggested change

Original:
#### **--keep-groups**
pass --group-add keep-groups to podman (default: False)
Needed to access the gpu on some systems, but has an impact on security, use with caution.

Suggested:
#### **--keep-groups**
pass --group-add keep-groups to podman (default: False)
Needed to access the gpu on some systems, but has significant security implications:
- Preserves the host user's supplementary group memberships inside the container
- Could grant the container unnecessary elevated privileges through group memberships (e.g., disk, docker, sudo)
- May allow container processes to access host system resources that share the same group permissions
- Breaks container isolation principles by sharing the host's security context
Only use this flag if GPU access cannot be achieved through more secure methods such as specific device mapping.

**Collaborator:**

I think these are great suggestions for documentation, at least; dunno if you saw this bot comment @rhatdan @khumarahn
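
For readers of the two threads above, here is a minimal sketch (not part of this PR, and not RamaLama code; the device paths and the helper name are illustrative assumptions) of one possible heuristic for deciding when rootless Podman would need `--group-add keep-groups`: check whether the GPU device nodes are only reachable through one of the invoking user's supplementary groups.

```python
import os
import stat

def needs_keep_groups(devices=("/dev/kfd", "/dev/dri/renderD128")):
    """Hypothetical helper: return True if a GPU device node is only
    accessible via a supplementary group, which rootless Podman drops
    unless --group-add keep-groups is passed."""
    supplementary = set(os.getgroups()) - {os.getgid()}
    for dev in devices:
        try:
            st = os.stat(dev)
        except FileNotFoundError:
            continue
        group_rw = st.st_mode & (stat.S_IRGRP | stat.S_IWGRP)
        if group_rw and st.st_gid in supplementary:
            return True
    return False
```

As the thread notes, no such check is bulletproof (device paths and group setups vary by vendor and distribution), which is why the PR keeps the flag explicit and off by default.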


#### **--ngl**
number of gpu layers (default: 999)

#### **--nocontainer**
do not run RamaLama in the default container (default: False)
The default can be overridden in the ramalama.conf file.
9 changes: 9 additions & 0 deletions docs/ramalama.conf
@@ -44,6 +44,15 @@
#
#host = "0.0.0.0"

# Pass `--group-add keep-groups` to podman, when using podman.
# In some cases this is needed to access the gpu from a rootless container
#
#keep_groups = false

# Default number of layers offloaded to the gpu
#
#ngl = 999

# Specify default port for services to listen on
#
#port = "8080"
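
For illustration, a minimal sketch (not RamaLama's actual loader; the file path and the optional `[ramalama]` table are assumptions not shown in this diff) of how these keys could be read with the documented defaults, mirroring the `config.get(...)` fallbacks used in cli.py below:

```python
import tomllib  # Python 3.11+; purely illustrative, assuming the conf parses as TOML

def load_config(path="/usr/share/ramalama/ramalama.conf"):
    """Read the conf file, falling back to the documented defaults
    when the file or a key is missing (hypothetical helper)."""
    data = {}
    try:
        with open(path, "rb") as f:
            data = tomllib.load(f)
    except FileNotFoundError:
        pass
    cfg = data.get("ramalama", data)  # keys may sit under a [ramalama] table
    return {
        "keep_groups": cfg.get("keep_groups", False),
        "ngl": cfg.get("ngl", 999),
        "port": cfg.get("port", "8080"),
    }
```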
9 changes: 9 additions & 0 deletions docs/ramalama.conf.5.md
@@ -86,6 +86,15 @@ IP address for llama.cpp to listen on.
OCI container image to run with the specified AI model
RAMALAMA_IMAGE environment variable overrides this field.

**keep_groups**=false

Pass `--group-add keep-groups` to podman, when using podman.
In some cases this is needed to access the gpu from a rootless container

**ngl**=999

Default number of layers to offload to the gpu

**port**="8080"

Specify default port for services to listen on
15 changes: 15 additions & 0 deletions ramalama/cli.py
@@ -189,6 +189,21 @@ def configure_arguments(parser):
action="store_true",
help="offload the workload to the GPU",
)
parser.add_argument(
"--ngl",
dest="ngl",
type=int,
default=config.get("ngl", 999),
help="Number of layers to offload to the gpu, if available"
)
parser.add_argument(
"--keep-groups",
dest="podman_keep_groups",
default=config.get("keep_groups", False),
action="store_true",
help="""pass `--group-add keep-groups` to podman, if using podman.
Needed to access gpu on some systems, but has security implications.""",
)
parser.add_argument(
"--image",
default=config.get("image"),
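One detail worth noting about the pattern above, relevant to the review comment asking for a way to disable the feature: with `action="store_true"` and a config-driven default, the command-line flag can only switch the value on. A minimal, self-contained reproduction with plain argparse (not RamaLama code; the hard-coded default stands in for the config lookup):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--keep-groups",
    dest="podman_keep_groups",
    default=False,           # stand-in for config.get("keep_groups", False)
    action="store_true",
)

print(parser.parse_args([]))                  # Namespace(podman_keep_groups=False)
print(parser.parse_args(["--keep-groups"]))   # Namespace(podman_keep_groups=True)
# If the config default were True, there is no "--no-keep-groups" here to
# turn it back off; disabling would have to happen in ramalama.conf.
```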
16 changes: 9 additions & 7 deletions ramalama/model.py
Expand Up @@ -160,6 +160,8 @@ def setup_container(self, args):

if os.path.basename(args.engine) == "podman":
conman_args += ["--pull=newer"]
if args.podman_keep_groups:
conman_args += ["--group-add", "keep-groups"]
elif os.path.basename(args.engine) == "docker":
try:
run_cmd([args.engine, "pull", "-q", args.image], ignore_all=True)
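
To make the effect of the new `--group-add keep-groups` branch concrete, here is a sketch of the kind of command line it contributes to (illustrative only: the image name and the surrounding `run` flags are assumptions, not what setup_container actually emits):

```python
# Illustrative only, not RamaLama's code path.
engine = "podman"
podman_keep_groups = True  # i.e. args.podman_keep_groups

conman_args = [engine, "run", "--rm", "-it", "--pull=newer"]
if podman_keep_groups:
    # With rootless Podman, "keep-groups" is a special value telling the
    # OCI runtime to keep the invoking user's supplementary groups
    # (e.g. video/render) instead of dropping them in the container.
    conman_args += ["--group-add", "keep-groups"]
conman_args += ["quay.io/ramalama/ramalama:latest"]

print(" ".join(conman_args))
# podman run --rm -it --pull=newer --group-add keep-groups quay.io/ramalama/ramalama:latest
```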
@@ -188,10 +190,10 @@ def setup_container(self, args):
conman_args += ["-e", f"{k}={v}"]
return conman_args

def gpu_args(self, force=False, runner=False):
def gpu_args(self, args, runner=False):
gpu_args = []
if (
force
args.gpu
**Collaborator:**
I don't think this PR is correct: on systems with integrated graphics we don't want to use GPU acceleration; it's typically much slower than CPU inferencing, practically unusable.

With the default being a static 999, this will always be the case now, which is not good. We should look at this again before we spin up a new release.

**Collaborator:**
We can have this option, but the default being 999 is not good.

**Collaborator Author:**
I don't think I changed any existing logic here; I just added two options that allow RamaLama to work on my system. The default of 999 layers is of course questionable.

**Collaborator:**
It's also the lack of VRAM on integrated graphics; most of the time one will crash on one of these systems because of running out of VRAM.

**Collaborator Author:**
Sure, but somehow RamaLama works on my laptop with no GPU without any configuration. This PR did not change the default behaviour of RamaLama, unless I'm missing something.

**Collaborator (@ericcurtin, Feb 2, 2025):**
We should try to figure out why 999 was or wasn't included on your system.

or os.getenv("HIP_VISIBLE_DEVICES")
or os.getenv("ASAHI_VISIBLE_DEVICES")
or os.getenv("CUDA_VISIBLE_DEVICES")
@@ -206,7 +208,7 @@ def gpu_args(self, force=False, runner=False):
else:
gpu_args += ["-ngl"] # single dash

gpu_args += ["999"]
gpu_args += [f'{args.ngl}']

return gpu_args
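
For readers following the thread above, a minimal sketch of why `-ngl` is only emitted when `--gpu` is passed or one of the vendor visibility variables is set (an approximation only: the full condition is partly elided in this diff):

```python
import os

def wants_gpu_offload(gpu_flag: bool) -> bool:
    """Approximation of the guard around gpu_args above; the real check
    in model.py has more clauses than this diff shows."""
    return bool(
        gpu_flag
        or os.getenv("HIP_VISIBLE_DEVICES")
        or os.getenv("ASAHI_VISIBLE_DEVICES")
        or os.getenv("CUDA_VISIBLE_DEVICES")
    )

# On a machine with no GPU and none of these variables set, this returns
# False and no "-ngl 999" is ever passed, consistent with the author's
# observation that the default behaviour did not change.
print(wants_gpu_offload(gpu_flag=False))
```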

@@ -256,7 +258,7 @@ def build_exec_args_perplexity(self, args, model_path):
exec_args = ["llama-perplexity"]

get_gpu()
gpu_args = self.gpu_args(force=args.gpu)
gpu_args = self.gpu_args(args=args)
if gpu_args is not None:
exec_args.extend(gpu_args)

@@ -295,7 +297,7 @@ def build_exec_args_bench(self, args, model_path):
exec_args = ["llama-bench"]

get_gpu()
gpu_args = self.gpu_args(force=args.gpu)
gpu_args = self.gpu_args(args=args)
if gpu_args is not None:
exec_args.extend(gpu_args)

Expand All @@ -314,7 +316,7 @@ def build_exec_args_run(self, args, model_path, prompt):
exec_args += ["-v"]

get_gpu()
gpu_args = self.gpu_args(force=args.gpu, runner=True)
gpu_args = self.gpu_args(args=args, runner=True)
if gpu_args is not None:
exec_args.extend(gpu_args)

@@ -379,7 +381,7 @@ def handle_runtime(self, args, exec_args, exec_model_path):
exec_args = ["--port", args.port, "--model", MNT_FILE, "--max_model_len", "2048"]
else:
get_gpu()
gpu_args = self.gpu_args(force=args.gpu)
gpu_args = self.gpu_args(args=args)
if gpu_args is not None:
exec_args.extend(gpu_args)
exec_args.extend(["--host", args.host])