From 0554bbff60ca9c30b0fd0cc7da4b1653814932a1 Mon Sep 17 00:00:00 2001
From: Eric Curtin
Date: Wed, 13 Nov 2024 19:20:15 +0000
Subject: [PATCH] Add llama-cpp-python server

Changed default runtime from 'llama.cpp' to 'llama-cpp-python'.
Added 'llama-cpp-python' as a runtime option for better
flexibility with the `--runtime` flag.

Signed-off-by: Eric Curtin
---
Review notes (this section is ignored by `git am`):

* The patch text had lost its line structure. Line breaks, blank context
  lines and indentation below were reconstructed so that every hunk matches
  its "@@" line counts and the diffstat; verify whitespace against the tree
  before applying.
* ramalama/model.py, run(): added the missing comma between "--temp" and
  f"{args.temp}". Without it Python concatenates the two adjacent literals
  into the single argument "--temp0.8", which contradicts the expectation
  updated in test/system/030-run.bats. The "index" blob id for model.py
  therefore no longer matches the post-image exactly.
* NOTE(review): build_llama_and_whisper.sh no longer passes
  --prefix="$install_prefix" to pip, while the cuda build still installs the
  C++ binaries under /tmp/install. Confirm the final cuda stage still
  receives the llama_cpp Python package.
* NOTE(review): exec_cmd() keeps its `stderr` parameter, but the only use of
  it visible in this hunk is removed. Confirm callers no longer rely on it.

 container-images/asahi/Containerfile          |  6 +--
 container-images/cuda/Containerfile           |  7 ++-
 container-images/ramalama/Containerfile       |  6 +--
 container-images/rocm/Containerfile           |  2 +-
 .../scripts/build_llama_and_whisper.sh        | 39 +++++++++++------
 container-images/vulkan/Containerfile         |  2 +-
 docs/ramalama-serve.1.md                      |  2 +-
 ramalama/cli.py                               |  1 +
 ramalama/common.py                            |  5 ---
 ramalama/model.py                             | 43 +++++++------------
 scripts/replace-shas.sh                       | 15 +++++++
 test/system/030-run.bats                      |  2 +-
 test/system/040-serve.bats                    |  8 ++--
 13 files changed, 74 insertions(+), 64 deletions(-)
 create mode 100755 scripts/replace-shas.sh

diff --git a/container-images/asahi/Containerfile b/container-images/asahi/Containerfile
index 3167d964..1c275e4e 100644
--- a/container-images/asahi/Containerfile
+++ b/container-images/asahi/Containerfile
@@ -1,11 +1,11 @@
 FROM fedora:41
 
-ARG LLAMA_CPP_SHA=1329c0a75e6a7defc5c380eaf80d8e0f66d7da78
+ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
 # renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
-ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
+ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467
 
 COPY ../scripts /scripts
 RUN chmod +x /scripts/*.sh && \
     /scripts/build_llama_and_whisper.sh "asahi" "$LLAMA_CPP_SHA" \
-      "$WHISPER_CPP_SHA" "/usr" "-DGGML_VULKAN=1"
+      "$WHISPER_CPP_SHA"
 
diff --git a/container-images/cuda/Containerfile b/container-images/cuda/Containerfile
index 129e6479..0e8861ac 100644
--- a/container-images/cuda/Containerfile
+++ b/container-images/cuda/Containerfile
@@ -1,15 +1,14 @@
 # Base image with CUDA for compilation
 FROM docker.io/nvidia/cuda:12.6.2-devel-ubi9 AS builder
 
-ARG LLAMA_CPP_SHA=af148c9386da825a60c7038549c121c35ca56b50
+ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
 # renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
-ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
+ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467
 
 COPY ../scripts /scripts
 RUN chmod +x /scripts/*.sh && \
     /scripts/build_llama_and_whisper.sh "cuda" "$LLAMA_CPP_SHA" \
-      "$WHISPER_CPP_SHA" "/tmp/install" \
-      "-DGGML_CUDA=ON" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined"
+      "$WHISPER_CPP_SHA"
 
 # Final runtime image
 FROM docker.io/nvidia/cuda:12.6.2-runtime-ubi9
diff --git a/container-images/ramalama/Containerfile b/container-images/ramalama/Containerfile
index d6d7ce47..3a4e5d01 100644
--- a/container-images/ramalama/Containerfile
+++ b/container-images/ramalama/Containerfile
@@ -1,13 +1,13 @@
 FROM registry.access.redhat.com/ubi9/ubi:9.4-1214.1729773476
 
-ARG LLAMA_CPP_SHA=af148c9386da825a60c7038549c121c35ca56b50
+ARG LLAMA_CPP_SHA=7585edbdebd02861e0994dae67c9338731fb3fc5
 # renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
-ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
+ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467
 
 COPY ../scripts /scripts
 RUN chmod +x /scripts/*.sh && \
     /scripts/build_llama_and_whisper.sh "ramalama" "$LLAMA_CPP_SHA" \
-      "$WHISPER_CPP_SHA" "/usr" "-DGGML_KOMPUTE=1"
+      "$WHISPER_CPP_SHA"
 
 ENV WHISPER_CPP_SHA=${WHISPER_CPP_SHA}
 ENV LLAMA_CPP_SHA=${LLAMA_CPP_SHA}
diff --git a/container-images/rocm/Containerfile b/container-images/rocm/Containerfile
index fd71f4fb..953b5d99 100644
--- a/container-images/rocm/Containerfile
+++ b/container-images/rocm/Containerfile
@@ -8,5 +8,5 @@ COPY rocm/rocm.repo /etc/yum.repos.d/
 COPY scripts /scripts
 RUN chmod +x /scripts/*.sh && \
     /scripts/build_llama_and_whisper.sh "rocm" "$LLAMA_CPP_SHA" \
-      "$WHISPER_CPP_SHA" "/usr" "-DGGML_HIPBLAS=1"
+      "$WHISPER_CPP_SHA"
 
diff --git a/container-images/scripts/build_llama_and_whisper.sh b/container-images/scripts/build_llama_and_whisper.sh
index 062d99fe..1a627cca 100644
--- a/container-images/scripts/build_llama_and_whisper.sh
+++ b/container-images/scripts/build_llama_and_whisper.sh
@@ -30,7 +30,8 @@ dnf_install() {
     dnf install -y rocm-dev hipblas-devel rocblas-devel
   elif [ "$containerfile" = "cuda" ]; then
     dnf install -y "${rpm_list[@]}" gcc-toolset-12
-    source /opt/rh/gcc-toolset-12/enable
+    # shellcheck disable=SC1091
+    . /opt/rh/gcc-toolset-12/enable
   fi
 
   # For Vulkan image, we don't need to install anything extra but rebuild with
@@ -39,28 +40,42 @@ dnf_install() {
 
 cmake_steps() {
   local flag="$1"
-  cmake -B build "${common_flags[@]}" "$flag"
+  cmake -B build "${cpp_flags[@]}" "$flag"
   cmake --build build --config Release -j"$(nproc)"
   cmake --install build
 }
 
+set_install_prefix() {
+  if [ "$containerfile" = "cuda" ]; then
+    install_prefix="/tmp/install"
+  else
+    install_prefix="/usr"
+  fi
+}
+
 main() {
   set -e
 
   local containerfile="$1"
   local llama_cpp_sha="$2"
   local whisper_cpp_sha="$3"
-  local install_prefix="$4"
-  local build_flag_1="$5"
-  local build_flag_2="$6"
-  local common_flags=("-DGGML_CCACHE=0" \
-    "-DCMAKE_INSTALL_PREFIX=$install_prefix" "$build_flag_1")
-  if [ -n "$build_flag_2" ]; then
-    common_flags+=("$build_flag_2")
+  local install_prefix
+  set_install_prefix
+  local common_flags
+  if [ "$containerfile" = "ramalama" ]; then
+    common_flags=("-DGGML_KOMPUTE=1")
+  elif [ "$containerfile" = "rocm" ]; then
+    common_flags=("-DGGML_HIPBLAS=1")
+  elif [ "$containerfile" = "cuda" ]; then
+    common_flags=("-DGGML_NATIVE=OFF" "-DGGML_CUDA=ON" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined")
+  elif [ "$containerfile" = "vulkan" ] || [ "$containerfile" = "asahi" ]; then
+    common_flags=("-DGGML_VULKAN=1")
   fi
 
+  local cpp_flags=("${common_flags[@]}")
+  cpp_flags+=("-DGGML_CCACHE=0" \
+    "-DCMAKE_INSTALL_PREFIX=$install_prefix")
   dnf_install
-
   git clone https://github.com/ggerganov/llama.cpp
   cd llama.cpp
   git reset --hard "$llama_cpp_sha"
@@ -75,9 +90,7 @@ main() {
   mv build/bin/server "$install_prefix/bin/whisper-server"
   cd ..
 
-  CMAKE_ARGS="${common_flags[*]}" FORCE_CMAKE=1 \
-    pip install --prefix="$install_prefix" 'llama-cpp-python[server]'
-
+  CMAKE_ARGS="${common_flags[*]}" pip install "llama-cpp-python[server]"
   dnf clean all
   rm -rf /var/cache/*dnf* /opt/rocm-*/lib/llvm \
     /opt/rocm-*/lib/rocblas/library/*gfx9* llama.cpp whisper.cpp
diff --git a/container-images/vulkan/Containerfile b/container-images/vulkan/Containerfile
index b89b8348..dd910442 100644
--- a/container-images/vulkan/Containerfile
+++ b/container-images/vulkan/Containerfile
@@ -3,5 +3,5 @@ FROM quay.io/ramalama/ramalama:latest
 COPY ../scripts /scripts
 RUN chmod +x /scripts/*.sh && \
     /scripts/build_llama_and_whisper.sh "vulkan" "$LLAMA_CPP_SHA" \
-      "$WHISPER_CPP_SHA" "/usr" "-DGGML_VULKAN=1"
+      "$WHISPER_CPP_SHA"
 
diff --git a/docs/ramalama-serve.1.md b/docs/ramalama-serve.1.md
index 48f171ee..cc5c8567 100644
--- a/docs/ramalama-serve.1.md
+++ b/docs/ramalama-serve.1.md
@@ -95,7 +95,7 @@ After=local-fs.target
 [Container]
 AddDevice=-/dev/dri
 AddDevice=-/dev/kfd
-Exec=llama-server --port 1234 -m $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
+Exec=python3 -m llama_cpp.server --port 1234 --model $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
 Image=quay.io/ramalama/ramalama:latest
 Mount=type=bind,src=/home/dwalsh/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf,target=/mnt/models/model.file,ro,Z
 ContainerName=MyGraniteServer
diff --git a/ramalama/cli.py b/ramalama/cli.py
index 5694bf56..415c8ef9 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -215,6 +215,7 @@ def configure_arguments(parser):
     )
     parser.add_argument("-v", "--version", dest="version", action="store_true", help="show RamaLama version")
 
+
 def configure_subcommands(parser):
     """Add subcommand parsers to the main argument parser."""
     subparsers = parser.add_subparsers(dest="subcommand")
diff --git a/ramalama/common.py b/ramalama/common.py
index a2563e8a..79d2a3ff 100644
--- a/ramalama/common.py
+++ b/ramalama/common.py
@@ -59,11 +59,6 @@ def exec_cmd(args, stderr=True, debug=False):
     if debug:
         perror("exec_cmd: ", *args)
 
-    if not stderr:
-        # Redirecting stderr to /dev/null
-        with open(os.devnull, "w") as devnull:
-            os.dup2(devnull.fileno(), sys.stderr.fileno())
-
     try:
         return os.execvp(args[0], args)
     except Exception:
diff --git a/ramalama/model.py b/ramalama/model.py
index 11ab530d..14e245b3 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -254,36 +254,22 @@ def run(self, args):
         if not args.container:
             exec_model_path = model_path
 
-        exec_args = [
-            "llama-cli",
-            "-m",
-            exec_model_path,
-            "--in-prefix",
-            "",
-            "--in-suffix",
-            "",
-            "-c",
-            f"{args.context}",
-            "--temp",
-            f"{args.temp}",
-        ]
+        exec_args = ["llama-run", "-c", f"{args.context}", "--temp", f"{args.temp}"]
 
         if args.seed:
             exec_args += ["--seed", args.seed]
 
-        if not args.debug:
-            exec_args += ["--no-display-prompt"]
-        exec_args += [
-            "-p",
-            prompt,
-        ]
-
-        if not args.ARGS and sys.stdin.isatty():
-            exec_args.append("-cnv")
+        if args.debug:
+            exec_args += ["-v"]
 
         if args.gpu:
             exec_args.extend(self.gpu_args())
 
+        exec_args += [
+            exec_model_path,
+            prompt,
+        ]
+
         try:
             if self.exec_model_in_container(model_path, exec_args, args):
                 return
@@ -315,13 +301,15 @@ def serve(self, args):
             exec_model_path = model_path
 
         exec_args = [
-            "llama-server",
+            "python3",
+            "-m",
+            "llama_cpp.server",
             "--port",
             args.port,
-            "-m",
+            "--model",
             exec_model_path,
-            "-c",
-            f"{args.context}",
+            "--host",
+            args.host,
             "--temp",
             f"{args.temp}",
         ]
@@ -329,8 +317,7 @@ def serve(self, args):
             exec_args += ["--seed", args.seed]
 
         if args.runtime == "vllm":
-            if not (exec_model_path.endswith(".GGUF") or exec_model_path.endswith(".gguf")):
-                exec_model_path = os.path.dirname(exec_model_path)
+            exec_model_path = os.path.dirname(exec_model_path)
             exec_args = ["vllm", "serve", "--port", args.port, exec_model_path]
         else:
             if args.gpu:
diff --git a/scripts/replace-shas.sh b/scripts/replace-shas.sh
new file mode 100755
index 00000000..701f6456
--- /dev/null
+++ b/scripts/replace-shas.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -euo pipefail
+
+find_files() {
+  grep -rl "$1_CPP_SHA=" container-images/
+}
+
+sed_files() {
+  xargs sed -i "s/ARG $1_CPP_SHA=.*/ARG $1_CPP_SHA=$2/g"
+}
+
+find_files "LLAMA" | sed_files "LLAMA" "$1"
+find_files "WHISPER" | sed_files "WHISPER" "$2"
+
diff --git a/test/system/030-run.bats b/test/system/030-run.bats
index 5ad8cab6..73aa007a 100755
--- a/test/system/030-run.bats
+++ b/test/system/030-run.bats
@@ -34,7 +34,7 @@ load helpers
         is "$output" ".*${image} /bin/sh -c" "verify image name"
     else
         run_ramalama --dryrun run -c 4096 ${model}
-        is "$output" 'llama-cli -m /path/to/model --in-prefix --in-suffix -c 4096 --temp 0.8 --no-display-prompt -p.*' "dryrun correct"
+        is "$output" 'llama-run -c 4096 --temp 0.8 /path/to/model' "dryrun correct"
         is "$output" ".*-c 4096" "verify model name"
 
         run_ramalama 1 run --ctx-size=4096 --name foobar tiny
diff --git a/test/system/040-serve.bats b/test/system/040-serve.bats
index fbfccd4d..551b4ff1 100644
--- a/test/system/040-serve.bats
+++ b/test/system/040-serve.bats
@@ -145,7 +145,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
 
     run cat tinyllama.container
     is "$output" ".*PublishPort=1234" "PublishPort should match"
-    is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
+    is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct"
     is "$output" ".*Mount=type=bind,.*tinyllama" "Mount line should be correct"
 
     rm tinyllama.container
@@ -183,7 +183,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
     run cat $name.container
     is "$output" ".*PublishPort=1234" "PublishPort should match"
     is "$output" ".*ContainerName=${name}" "Quadlet should have ContainerName field"
-    is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
+    is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct"
     is "$output" ".*Mount=type=image,source=${ociimage},destination=/mnt/models,subpath=/models,readwrite=false" "Volume line should be correct"
 
     if is_container; then
@@ -235,7 +235,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
 
     run cat $name.yaml
     is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
-    is "$output" ".*command: \[\"llama-server\"\]" "Should command"
+    is "$output" ".*command: \[\"python3\"\]" "Should command"
     is "$output" ".*containerPort: 1234" "Should container container port"
 
     run_ramalama serve --name=${name} --port 1234 --generate=quadlet/kube ${model}
@@ -244,7 +244,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
 
     run cat $name.yaml
     is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
-    is "$output" ".*command: \[\"llama-server\"\]" "Should command"
+    is "$output" ".*command: \[\"python3\"\]" "Should command"
     is "$output" ".*containerPort: 1234" "Should container container port"
 
     run cat $name.kube