From c9c777d959b829303e7ca55e9a195d4d27cb35d9 Mon Sep 17 00:00:00 2001
From: Alexey Korepanov
Date: Sat, 1 Feb 2025 12:47:09 +0000
Subject: [PATCH] add --keep-groups and --ngl options

Signed-off-by: Alexey Korepanov
---
 docs/ramalama.1.md      |  7 +++++++
 docs/ramalama.conf      |  9 +++++++++
 docs/ramalama.conf.5.md |  9 +++++++++
 ramalama/cli.py         | 15 +++++++++++++++
 ramalama/model.py       | 16 +++++++++-------
 5 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/docs/ramalama.1.md b/docs/ramalama.1.md
index 425f2b3a..0fdf47dc 100644
--- a/docs/ramalama.1.md
+++ b/docs/ramalama.1.md
@@ -109,6 +109,13 @@ The default can be overridden in the ramalama.conf file or via the
 the RAMALAMA_IMAGE environment variable.
 `export RAMALAMA_TRANSPORT=quay.io/ramalama/aiimage:latest` tells RamaLama to use the `quay.io/ramalama/aiimage:latest` image.
 
+#### **--keep-groups**
+pass `--group-add keep-groups` to podman (default: False)
+Needed to access the GPU on some systems, but it has security implications; use with caution.
+
+#### **--ngl**
+number of layers to offload to the GPU (default: 999)
+
 #### **--nocontainer**
 do not run RamaLama in the default container (default: False)
 The default can be overridden in the ramalama.conf file.
diff --git a/docs/ramalama.conf b/docs/ramalama.conf
index 1d1e2a4b..e8c18d73 100644
--- a/docs/ramalama.conf
+++ b/docs/ramalama.conf
@@ -44,6 +44,15 @@
 #
 #host = "0.0.0.0"
 
+# Pass `--group-add keep-groups` to podman when podman is the container engine.
+# In some cases this is needed to access the GPU from a rootless container
+#
+#keep_groups = false
+
+# Default number of layers to offload to the GPU
+#
+#ngl = 999
+
 # Specify default port for services to listen on
 #
 #port = "8080"
diff --git a/docs/ramalama.conf.5.md b/docs/ramalama.conf.5.md
index 747f93cb..9924c599 100644
--- a/docs/ramalama.conf.5.md
+++ b/docs/ramalama.conf.5.md
@@ -86,6 +86,15 @@ IP address for llama.cpp to listen on.
 OCI container image to run with the specified AI model
 RAMALAMA_IMAGE environment variable overrides this field.
 
+**keep_groups**=false
+
+Pass `--group-add keep-groups` to podman when podman is the container engine.
+In some cases this is needed to access the GPU from a rootless container.
+
+**ngl**=999
+
+Default number of layers to offload to the GPU.
+
 **port**="8080"
 
 Specify default port for services to listen on
diff --git a/ramalama/cli.py b/ramalama/cli.py
index 0b3577b4..2ff2f911 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -189,6 +189,21 @@ def configure_arguments(parser):
         action="store_true",
         help="offload the workload to the GPU",
     )
+    parser.add_argument(
+        "--ngl",
+        dest="ngl",
+        type=int,
+        default=config.get("ngl", 999),
+        help="number of layers to offload to the GPU, if available",
+    )
+    parser.add_argument(
+        "--keep-groups",
+        dest="podman_keep_groups",
+        default=config.get("keep_groups", False),
+        action="store_true",
+        help="""pass `--group-add keep-groups` to podman, if using podman.
+Needed to access the GPU on some systems, but has security implications.""",
+    )
     parser.add_argument(
         "--image",
         default=config.get("image"),
diff --git a/ramalama/model.py b/ramalama/model.py
index 710a27ac..e114806f 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -160,6 +160,8 @@ def setup_container(self, args):
 
         if os.path.basename(args.engine) == "podman":
             conman_args += ["--pull=newer"]
+            if args.podman_keep_groups:
+                conman_args += ["--group-add", "keep-groups"]
         elif os.path.basename(args.engine) == "docker":
             try:
                 run_cmd([args.engine, "pull", "-q", args.image], ignore_all=True)
@@ -188,10 +190,10 @@ def setup_container(self, args):
                 conman_args += ["-e", f"{k}={v}"]
         return conman_args
 
-    def gpu_args(self, force=False, runner=False):
+    def gpu_args(self, args, runner=False):
         gpu_args = []
         if (
-            force
+            args.gpu
             or os.getenv("HIP_VISIBLE_DEVICES")
             or os.getenv("ASAHI_VISIBLE_DEVICES")
             or os.getenv("CUDA_VISIBLE_DEVICES")
@@ -206,7 +208,7 @@ def gpu_args(self, force=False, runner=False):
             else:
                 gpu_args += ["-ngl"]  # single dash
 
-            gpu_args += ["999"]
+            gpu_args += [f"{args.ngl}"]
 
         return gpu_args
 
@@ -256,7 +258,7 @@ def build_exec_args_perplexity(self, args, model_path):
         exec_args = ["llama-perplexity"]
 
         get_gpu()
-        gpu_args = self.gpu_args(force=args.gpu)
+        gpu_args = self.gpu_args(args=args)
         if gpu_args is not None:
             exec_args.extend(gpu_args)
 
@@ -295,7 +297,7 @@ def build_exec_args_bench(self, args, model_path):
         exec_args = ["llama-bench"]
 
         get_gpu()
-        gpu_args = self.gpu_args(force=args.gpu)
+        gpu_args = self.gpu_args(args=args)
         if gpu_args is not None:
             exec_args.extend(gpu_args)
 
@@ -314,7 +316,7 @@ def build_exec_args_run(self, args, model_path, prompt):
             exec_args += ["-v"]
 
         get_gpu()
-        gpu_args = self.gpu_args(force=args.gpu, runner=True)
+        gpu_args = self.gpu_args(args=args, runner=True)
         if gpu_args is not None:
             exec_args.extend(gpu_args)
 
@@ -379,7 +381,7 @@ def handle_runtime(self, args, exec_args, exec_model_path):
             exec_args = ["--port", args.port, "--model", MNT_FILE, "--max_model_len", "2048"]
         else:
             get_gpu()
-            gpu_args = self.gpu_args(force=args.gpu)
+            gpu_args = self.gpu_args(args=args)
             if gpu_args is not None:
                 exec_args.extend(gpu_args)
             exec_args.extend(["--host", args.host])
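
Reviewer note, not part of the patch: below is a minimal, self-contained sketch of how the two new options are intended to flow from the command line into the podman and llama.cpp invocations. The `build_commands` helper, the SimpleNamespace stand-in for the argparse namespace, and the reduced set of environment-variable checks are illustrative assumptions, not RamaLama code.

    # Hedged sketch only: args.ngl replaces the hard-coded "999" and
    # args.podman_keep_groups gates `--group-add keep-groups`, mirroring the
    # intent of this patch without RamaLama's real argparse/config wiring.
    import os
    from types import SimpleNamespace


    def build_commands(args):
        conman_args = ["podman", "run", "--rm"]
        if args.podman_keep_groups:
            # Keeps the caller's supplementary groups inside a rootless
            # container, which some systems need to reach GPU device nodes;
            # it also widens group membership, hence the security caveat.
            conman_args += ["--group-add", "keep-groups"]

        exec_args = ["llama-run"]
        if args.gpu or os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv("HIP_VISIBLE_DEVICES"):
            exec_args += ["-ngl", f"{args.ngl}"]  # layers offloaded to the GPU
        return conman_args, exec_args


    # Roughly what `ramalama --gpu --keep-groups --ngl 40 run <model>` should build.
    args = SimpleNamespace(podman_keep_groups=True, gpu=True, ngl=40)
    print(build_commands(args))

One design note on the patch itself: switching gpu_args(force=...) to gpu_args(args, ...) means every call site now passes the whole namespace, so later per-invocation GPU tuning (as --ngl does here) should not require another signature change.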