Add llama-cpp-python server
Changed the default runtime from 'llama.cpp' to 'llama-cpp-python' and added
'llama-cpp-python' as an accepted value for the `--runtime` flag, giving more
flexibility in choosing the serving backend.

Signed-off-by: Eric Curtin <[email protected]>
ericcurtin committed Dec 19, 2024
1 parent 307628e commit 0554bbf
Showing 13 changed files with 74 additions and 64 deletions.
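
With this change the backend can still be selected explicitly on the command
line. A minimal sketch, assuming the global `--runtime` flag accepts the new
value as described in the commit message (model name and flag placement are
illustrative):

    ramalama --runtime llama-cpp-python serve --port 1234 tinyllama
    ramalama --runtime llama.cpp serve --port 1234 tinyllama    # previous default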
6 changes: 3 additions & 3 deletions container-images/asahi/Containerfile
@@ -1,11 +1,11 @@
FROM fedora:41

ARG LLAMA_CPP_SHA=1329c0a75e6a7defc5c380eaf80d8e0f66d7da78
ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "asahi" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_VULKAN=1"
"$WHISPER_CPP_SHA"

7 changes: 3 additions & 4 deletions container-images/cuda/Containerfile
@@ -1,15 +1,14 @@
# Base image with CUDA for compilation
FROM docker.io/nvidia/cuda:12.6.2-devel-ubi9 AS builder

ARG LLAMA_CPP_SHA=af148c9386da825a60c7038549c121c35ca56b50
ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "cuda" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/tmp/install" \
"-DGGML_CUDA=ON" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined"
"$WHISPER_CPP_SHA"

# Final runtime image
FROM docker.io/nvidia/cuda:12.6.2-runtime-ubi9
6 changes: 3 additions & 3 deletions container-images/ramalama/Containerfile
@@ -1,13 +1,13 @@
FROM registry.access.redhat.com/ubi9/ubi:9.4-1214.1729773476

ARG LLAMA_CPP_SHA=af148c9386da825a60c7038549c121c35ca56b50
ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "ramalama" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_KOMPUTE=1"
"$WHISPER_CPP_SHA"

ENV WHISPER_CPP_SHA=${WHISPER_CPP_SHA}
ENV LLAMA_CPP_SHA=${LLAMA_CPP_SHA}
2 changes: 1 addition & 1 deletion container-images/rocm/Containerfile
@@ -8,5 +8,5 @@ COPY rocm/rocm.repo /etc/yum.repos.d/
COPY scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "rocm" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_HIPBLAS=1"
"$WHISPER_CPP_SHA"

39 changes: 26 additions & 13 deletions container-images/scripts/build_llama_and_whisper.sh
@@ -30,7 +30,8 @@ dnf_install() {
dnf install -y rocm-dev hipblas-devel rocblas-devel
elif [ "$containerfile" = "cuda" ]; then
dnf install -y "${rpm_list[@]}" gcc-toolset-12
source /opt/rh/gcc-toolset-12/enable
# shellcheck disable=SC1091
. /opt/rh/gcc-toolset-12/enable
fi

# For Vulkan image, we don't need to install anything extra but rebuild with
@@ -39,28 +40,42 @@ dnf_install() {

cmake_steps() {
local flag="$1"
cmake -B build "${common_flags[@]}" "$flag"
cmake -B build "${cpp_flags[@]}" "$flag"
cmake --build build --config Release -j"$(nproc)"
cmake --install build
}

set_install_prefix() {
if [ "$containerfile" = "cuda" ]; then
install_prefix="/tmp/install"
else
install_prefix="/usr"
fi
}

main() {
set -e

local containerfile="$1"
local llama_cpp_sha="$2"
local whisper_cpp_sha="$3"
local install_prefix="$4"
local build_flag_1="$5"
local build_flag_2="$6"
local common_flags=("-DGGML_CCACHE=0" \
"-DCMAKE_INSTALL_PREFIX=$install_prefix" "$build_flag_1")
if [ -n "$build_flag_2" ]; then
common_flags+=("$build_flag_2")
local install_prefix
set_install_prefix
local common_flags
if [ "$containerfile" = "ramalama" ]; then
common_flags=("-DGGML_KOMPUTE=1")
elif [ "$containerfile" = "rocm" ]; then
common_flags=("-DGGML_HIPBLAS=1")
elif [ "$containerfile" = "cuda" ]; then
common_flags=("-DGGML_NATIVE=OFF" "-DGGML_CUDA=ON" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined")
elif [ "$containerfile" = "vulkan" ] || [ "$containerfile" = "asahi" ]; then
common_flags=("-DGGML_VULKAN=1")
fi

local cpp_flags=("${common_flags[@]}")
cpp_flags+=("-DGGML_CCACHE=0" \
"-DCMAKE_INSTALL_PREFIX=$install_prefix")
dnf_install

git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
git reset --hard "$llama_cpp_sha"
@@ -75,9 +90,7 @@ main() {
mv build/bin/server "$install_prefix/bin/whisper-server"
cd ..

CMAKE_ARGS="${common_flags[*]}" FORCE_CMAKE=1 \
pip install --prefix="$install_prefix" 'llama-cpp-python[server]'

CMAKE_ARGS="${common_flags[*]}" pip install "llama-cpp-python[server]"
dnf clean all
rm -rf /var/cache/*dnf* /opt/rocm-*/lib/llvm \
/opt/rocm-*/lib/rocblas/library/*gfx9* llama.cpp whisper.cpp
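For the CUDA image, for instance, the pip step above would now effectively
expand to something like the following (a sketch assembled from the flag
selection in main(); not a literal line from the script):

    CMAKE_ARGS="-DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined" \
        pip install "llama-cpp-python[server]"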
2 changes: 1 addition & 1 deletion container-images/vulkan/Containerfile
@@ -3,5 +3,5 @@ FROM quay.io/ramalama/ramalama:latest
COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "vulkan" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_VULKAN=1"
"$WHISPER_CPP_SHA"

2 changes: 1 addition & 1 deletion docs/ramalama-serve.1.md
@@ -95,7 +95,7 @@ After=local-fs.target
[Container]
AddDevice=-/dev/dri
AddDevice=-/dev/kfd
Exec=llama-server --port 1234 -m $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
Exec=python3 -m llama_cpp.server --port 1234 --model $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
Image=quay.io/ramalama/ramalama:latest
Mount=type=bind,src=/home/dwalsh/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf,target=/mnt/models/model.file,ro,Z
ContainerName=MyGraniteServer
1 change: 1 addition & 0 deletions ramalama/cli.py
@@ -215,6 +215,7 @@ def configure_arguments(parser):
)
parser.add_argument("-v", "--version", dest="version", action="store_true", help="show RamaLama version")


def configure_subcommands(parser):
"""Add subcommand parsers to the main argument parser."""
subparsers = parser.add_subparsers(dest="subcommand")
5 changes: 0 additions & 5 deletions ramalama/common.py
@@ -59,11 +59,6 @@ def exec_cmd(args, stderr=True, debug=False):
if debug:
perror("exec_cmd: ", *args)

if not stderr:
# Redirecting stderr to /dev/null
with open(os.devnull, "w") as devnull:
os.dup2(devnull.fileno(), sys.stderr.fileno())

try:
return os.execvp(args[0], args)
except Exception:
43 changes: 15 additions & 28 deletions ramalama/model.py
@@ -254,36 +254,22 @@ def run(self, args):
if not args.container:
exec_model_path = model_path

exec_args = [
"llama-cli",
"-m",
exec_model_path,
"--in-prefix",
"",
"--in-suffix",
"",
"-c",
f"{args.context}",
"--temp",
f"{args.temp}",
]
exec_args = ["llama-run", "-c", f"{args.context}", "--temp" f"{args.temp}"]

if args.seed:
exec_args += ["--seed", args.seed]

if not args.debug:
exec_args += ["--no-display-prompt"]
exec_args += [
"-p",
prompt,
]

if not args.ARGS and sys.stdin.isatty():
exec_args.append("-cnv")
if args.debug:
exec_args += ["-v"]

if args.gpu:
exec_args.extend(self.gpu_args())

exec_args += [
exec_model_path,
prompt,
]

try:
if self.exec_model_in_container(model_path, exec_args, args):
return
@@ -315,22 +301,23 @@ def serve(self, args):
exec_model_path = model_path

exec_args = [
"llama-server",
"python3",
"-m",
"llama_cpp.server",
"--port",
args.port,
"-m",
"--model",
exec_model_path,
"-c",
f"{args.context}",
"--host",
args.host,
"--temp",
f"{args.temp}",
]
if args.seed:
exec_args += ["--seed", args.seed]

if args.runtime == "vllm":
if not (exec_model_path.endswith(".GGUF") or exec_model_path.endswith(".gguf")):
exec_model_path = os.path.dirname(exec_model_path)
exec_model_path = os.path.dirname(exec_model_path)
exec_args = ["vllm", "serve", "--port", args.port, exec_model_path]
else:
if args.gpu:
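llama-cpp-python's server exposes an OpenAI-compatible HTTP API, so once
`ramalama serve` is running the model can be exercised with a plain HTTP
request. A hedged example (port matches the examples in this commit; the
/v1/chat/completions path is assumed from llama-cpp-python's documented
behaviour):

    curl http://localhost:1234/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{"messages": [{"role": "user", "content": "Hello"}]}'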
15 changes: 15 additions & 0 deletions scripts/replace-shas.sh
@@ -0,0 +1,15 @@
#!/bin/bash

set -euo pipefail

find_files() {
grep -rl "$1_CPP_SHA=" container-images/
}

sed_files() {
xargs sed -i "s/ARG $1_CPP_SHA=.*/ARG $1_CPP_SHA=$2/g"
}

find_files "LLAMA" | sed_files "LLAMA" "$1"
find_files "WHISPER" | sed_files "WHISPER" "$2"

2 changes: 1 addition & 1 deletion test/system/030-run.bats
@@ -34,7 +34,7 @@ load helpers
is "$output" ".*${image} /bin/sh -c" "verify image name"
else
run_ramalama --dryrun run -c 4096 ${model}
is "$output" 'llama-cli -m /path/to/model --in-prefix --in-suffix -c 4096 --temp 0.8 --no-display-prompt -p.*' "dryrun correct"
is "$output" 'llama-run -c 4096 --temp 0.8 /path/to/model' "dryrun correct"
is "$output" ".*-c 4096" "verify model name"

run_ramalama 1 run --ctx-size=4096 --name foobar tiny
8 changes: 4 additions & 4 deletions test/system/040-serve.bats
@@ -145,7 +145,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name

run cat tinyllama.container
is "$output" ".*PublishPort=1234" "PublishPort should match"
is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct"
is "$output" ".*Mount=type=bind,.*tinyllama" "Mount line should be correct"

rm tinyllama.container
@@ -183,7 +183,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
run cat $name.container
is "$output" ".*PublishPort=1234" "PublishPort should match"
is "$output" ".*ContainerName=${name}" "Quadlet should have ContainerName field"
is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct"
is "$output" ".*Mount=type=image,source=${ociimage},destination=/mnt/models,subpath=/models,readwrite=false" "Volume line should be correct"

if is_container; then
@@ -235,7 +235,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name

run cat $name.yaml
is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
is "$output" ".*command: \[\"llama-server\"\]" "Should command"
is "$output" ".*command: \[\"python3\"\]" "Should command"
is "$output" ".*containerPort: 1234" "Should container container port"

run_ramalama serve --name=${name} --port 1234 --generate=quadlet/kube ${model}
@@ -244,7 +244,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name

run cat $name.yaml
is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
is "$output" ".*command: \[\"llama-server\"\]" "Should command"
is "$output" ".*command: \[\"python3\"\]" "Should command"
is "$output" ".*containerPort: 1234" "Should container container port"

run cat $name.kube
