Add llama-cpp-python server
Changed default runtime from 'llama.cpp' to 'llama-cpp-python'.
Added 'llama-cpp-python' as a runtime option for better
flexibility with the `--runtime` flag.

Signed-off-by: Eric Curtin <[email protected]>
ericcurtin committed Dec 19, 2024
1 parent 307628e commit acb465f
Showing 9 changed files with 61 additions and 18 deletions.
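
For illustration only (not part of the commit): with this change the default runtime becomes llama-cpp-python, and any runtime can still be selected per invocation through the global --runtime flag. The model name below is a placeholder:

    # uses the new default runtime, llama-cpp-python
    ramalama serve --port 1234 tinyllama

    # explicitly fall back to the previous default
    ramalama --runtime llama.cpp serve --port 1234 tinyllama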
4 changes: 2 additions & 2 deletions container-images/asahi/Containerfile
@@ -1,8 +1,8 @@
FROM fedora:41

-ARG LLAMA_CPP_SHA=1329c0a75e6a7defc5c380eaf80d8e0f66d7da78
+ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
-ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
+ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
4 changes: 2 additions & 2 deletions container-images/cuda/Containerfile
@@ -1,9 +1,9 @@
# Base image with CUDA for compilation
FROM docker.io/nvidia/cuda:12.6.2-devel-ubi9 AS builder

-ARG LLAMA_CPP_SHA=af148c9386da825a60c7038549c121c35ca56b50
+ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
-ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
+ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
4 changes: 2 additions & 2 deletions container-images/ramalama/Containerfile
@@ -1,8 +1,8 @@
FROM registry.access.redhat.com/ubi9/ubi:9.4-1214.1729773476

-ARG LLAMA_CPP_SHA=af148c9386da825a60c7038549c121c35ca56b50
+ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
-ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
+ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
2 changes: 1 addition & 1 deletion docs/ramalama-serve.1.md
@@ -95,7 +95,7 @@ After=local-fs.target
[Container]
AddDevice=-/dev/dri
AddDevice=-/dev/kfd
-Exec=llama-server --port 1234 -m $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
+Exec=python3 -m llama_cpp.server --port 1234 --model $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
Image=quay.io/ramalama/ramalama:latest
Mount=type=bind,src=/home/dwalsh/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf,target=/mnt/models/model.file,ro,Z
ContainerName=MyGraniteServer
6 changes: 3 additions & 3 deletions ramalama/cli.py
@@ -103,7 +103,7 @@ def load_and_merge_config():
)

ramalama_config['carimage'] = ramalama_config.get('carimage', "registry.access.redhat.com/ubi9-micro:latest")
-    ramalama_config['runtime'] = ramalama_config.get('runtime', 'llama.cpp')
+    ramalama_config['runtime'] = ramalama_config.get('runtime', 'llama-cpp-python')
ramalama_config['store'] = os.getenv('RAMALAMA_STORE', ramalama_config.get('store', get_store()))
ramalama_config['transport'] = os.getenv('RAMALAMA_TRANSPORT', ramalama_config.get('transport', "ollama"))

@@ -205,8 +205,8 @@ def configure_arguments(parser):
parser.add_argument(
"--runtime",
default=config.get("runtime"),
choices=["llama.cpp", "vllm"],
help="specify the runtime to use; valid options are 'llama.cpp' and 'vllm'",
choices=["llama-cpp-python", "llama.cpp", "vllm"],
help="specify the runtime to use; valid options are 'llama-cpp-python', 'llama.cpp' and 'vllm'",
)
parser.add_argument(
"--store",
34 changes: 31 additions & 3 deletions ramalama/model.py
@@ -298,6 +298,19 @@ def run(self, args):
)
raise NotImplementedError(file_not_found % {"cmd": exec_args[0], "error": str(e).strip("'")})

def execute_model(self, model_path, exec_args, args):
try:
if self.exec_model_in_container(model_path, exec_args, args):
return
if args.dryrun:
dry_run(exec_args)
return
exec_cmd(exec_args, debug=args.debug)
except FileNotFoundError as e:
if in_container():
raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'")))
raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))

def serve(self, args):
if hasattr(args, "name") and args.name:
if not args.container and not args.generate:
@@ -329,13 +342,28 @@ def serve(self, args):
exec_args += ["--seed", args.seed]

if args.runtime == "vllm":
if not (exec_model_path.endswith(".GGUF") or exec_model_path.endswith(".gguf")):
exec_model_path = os.path.dirname(exec_model_path)
exec_model_path = os.path.dirname(exec_model_path)
exec_args = ["vllm", "serve", "--port", args.port, exec_model_path]
elif args.runtime == "llama.cpp":
exec_args = ["llama-server", "--port", args.port, "-m", exec_model_path, "--host", args.host]

if args.gpu:
exec_args.extend(self.gpu_args())
else:
exec_args = [
"python3",
"-m",
"llama_cpp.server",
"--port",
args.port,
"--model",
exec_model_path,
"--host",
args.host,
]

if args.gpu:
exec_args.extend(self.gpu_args())
exec_args.extend(["--host", args.host])

if args.generate == "quadlet":
return self.quadlet(model_path, args, exec_args)
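
For context, an illustrative sketch (not text from the commit) of what the new default branch above produces: serve() now launches the llama-cpp-python HTTP server instead of llama-server, so the generated command looks roughly like

    python3 -m llama_cpp.server --port 1234 --model /mnt/models/model.file --host 0.0.0.0

where the port and mounted model path follow the quadlet example elsewhere in this commit and the host value is a placeholder; --runtime llama.cpp still yields the previous llama-server invocation.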
15 changes: 15 additions & 0 deletions scripts/replace-shas.sh
@@ -0,0 +1,15 @@
#!/bin/bash

set -euo pipefail

find_files() {
grep -rl "$1_CPP_SHA=" container-images/
}

sed_files() {
xargs sed -i "s/ARG $1_CPP_SHA=.*/ARG $1_CPP_SHA=$2/g"
}

find_files "LLAMA" | sed_files "LLAMA" "$1"
find_files "WHISPER" | sed_files "WHISPER" "$2"

8 changes: 4 additions & 4 deletions test/system/040-serve.bats
@@ -145,7 +145,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name

run cat tinyllama.container
is "$output" ".*PublishPort=1234" "PublishPort should match"
is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct"
is "$output" ".*Mount=type=bind,.*tinyllama" "Mount line should be correct"

rm tinyllama.container
@@ -183,7 +183,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
run cat $name.container
is "$output" ".*PublishPort=1234" "PublishPort should match"
is "$output" ".*ContainerName=${name}" "Quadlet should have ContainerName field"
is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct"
is "$output" ".*Mount=type=image,source=${ociimage},destination=/mnt/models,subpath=/models,readwrite=false" "Volume line should be correct"

if is_container; then
@@ -235,7 +235,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name

run cat $name.yaml
is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
is "$output" ".*command: \[\"llama-server\"\]" "Should command"
is "$output" ".*command: \[\"python3\"\]" "Should command"
is "$output" ".*containerPort: 1234" "Should container container port"

run_ramalama serve --name=${name} --port 1234 --generate=quadlet/kube ${model}
@@ -244,7 +244,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name

run cat $name.yaml
is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
is "$output" ".*command: \[\"llama-server\"\]" "Should command"
is "$output" ".*command: \[\"python3\"\]" "Should command"
is "$output" ".*containerPort: 1234" "Should container container port"

run cat $name.kube
2 changes: 1 addition & 1 deletion test/system/060-info.bats
@@ -21,7 +21,7 @@ load helpers
# FIXME Engine (podman|docker|'')
tests="
Image | "quay.io/ramalama/ramalama:latest"
-Runtime | "llama.cpp"
+Runtime | "llama-cpp-python"
Version | "${version}"
Store | \\\("${HOME}/.local/share/ramalama"\\\|"/var/lib/ramalama"\\\)
"
