Add llama-run #452

Merged · 1 commit · Jan 3, 2025
6 changes: 3 additions & 3 deletions container-images/asahi/Containerfile
@@ -1,11 +1,11 @@
FROM fedora:41

ARG LLAMA_CPP_SHA=1329c0a75e6a7defc5c380eaf80d8e0f66d7da78
ARG LLAMA_CPP_SHA=0827b2c1da299805288abbd556d869318f2b121e
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "asahi" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_VULKAN=1"
"$WHISPER_CPP_SHA"

7 changes: 3 additions & 4 deletions container-images/cuda/Containerfile
@@ -1,15 +1,14 @@
# Base image with CUDA for compilation
FROM docker.io/nvidia/cuda:12.6.2-devel-ubi9 AS builder

ARG LLAMA_CPP_SHA=af148c9386da825a60c7038549c121c35ca56b50
ARG LLAMA_CPP_SHA=0827b2c1da299805288abbd556d869318f2b121e
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "cuda" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/tmp/install" \
"-DGGML_CUDA=ON" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined"
"$WHISPER_CPP_SHA"

# Final runtime image
FROM docker.io/nvidia/cuda:12.6.2-runtime-ubi9
6 changes: 3 additions & 3 deletions container-images/ramalama/Containerfile
@@ -1,13 +1,13 @@
FROM registry.access.redhat.com/ubi9/ubi:9.4-1214.1729773476

ARG LLAMA_CPP_SHA=af148c9386da825a60c7038549c121c35ca56b50
ARG LLAMA_CPP_SHA=0827b2c1da299805288abbd556d869318f2b121e
# renovate: datasource=git-refs depName=ggerganov/whisper.cpp packageName=https://github.com/ggerganov/whisper.cpp gitRef=master versioning=loose type=digest
ARG WHISPER_CPP_SHA=6266a9f9e56a5b925e9892acf650f3eb1245814d
ARG WHISPER_CPP_SHA=3de9deead5759eb038966990e3cb5d83984ae467

COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "ramalama" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_KOMPUTE=1"
"$WHISPER_CPP_SHA"

ENV WHISPER_CPP_SHA=${WHISPER_CPP_SHA}
ENV LLAMA_CPP_SHA=${LLAMA_CPP_SHA}
2 changes: 1 addition & 1 deletion container-images/rocm/Containerfile
@@ -8,5 +8,5 @@ COPY rocm/rocm.repo /etc/yum.repos.d/
COPY scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "rocm" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_HIPBLAS=1"
"$WHISPER_CPP_SHA"

94 changes: 68 additions & 26 deletions container-images/scripts/build_llama_and_whisper.sh
@@ -9,8 +9,7 @@ dnf_install() {
if [ "$containerfile" = "ramalama" ]; then
local url="https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm"
dnf install -y "$url"
crb enable
dnf install -y epel-release
crb enable # this is in epel-release, can only install epel-release via url
dnf --enablerepo=ubi-9-appstream-rpms install -y "${rpm_list[@]}"
local uname_m
uname_m="$(uname -m)"
@@ -30,54 +29,97 @@ dnf_install() {
dnf install -y rocm-dev hipblas-devel rocblas-devel
elif [ "$containerfile" = "cuda" ]; then
dnf install -y "${rpm_list[@]}" gcc-toolset-12
source /opt/rh/gcc-toolset-12/enable
# shellcheck disable=SC1091
. /opt/rh/gcc-toolset-12/enable
fi

# For Vulkan image, we don't need to install anything extra but rebuild with
# -DGGML_VULKAN
}

cmake_steps() {
local flag="$1"
cmake -B build "${common_flags[@]}" "$flag"
local cmake_flags=("${!1}")
cmake -B build "${cmake_flags[@]}"
cmake --build build --config Release -j"$(nproc)"
cmake --install build
}

main() {
set -e
set_install_prefix() {
if [ "$containerfile" = "cuda" ]; then
install_prefix="/tmp/install"
else
install_prefix="/usr"
fi
}

configure_common_flags() {
local containerfile="$1"
local llama_cpp_sha="$2"
local whisper_cpp_sha="$3"
local install_prefix="$4"
local build_flag_1="$5"
local build_flag_2="$6"
local common_flags=("-DGGML_CCACHE=0" \
"-DCMAKE_INSTALL_PREFIX=$install_prefix" "$build_flag_1")
if [ -n "$build_flag_2" ]; then
common_flags+=("$build_flag_2")
fi
local -n common_flags_ref=$2

dnf_install
common_flags_ref=("-DGGML_NATIVE=OFF")
case "$containerfile" in
rocm)
common_flags_ref+=("-DGGML_HIPBLAS=1")
;;
cuda)
common_flags_ref+=("-DGGML_CUDA=ON" "-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined")
;;
vulkan | asahi)
common_flags_ref+=("-DGGML_VULKAN=1")
;;
esac
}

git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
git reset --hard "$llama_cpp_sha"
cmake_steps
cd ..
clone_and_build_whisper_cpp() {
local whisper_cpp_sha="$1"
local install_prefix="$2"
local whisper_flags=("${!3}")
whisper_flags+=("-DBUILD_SHARED_LIBS=NO")

git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
git submodule update --init --recursive
git reset --hard "$whisper_cpp_sha"
cmake_steps "-DBUILD_SHARED_LIBS=NO"
cmake_steps whisper_flags
mkdir -p "$install_prefix/bin"
mv build/bin/main "$install_prefix/bin/whisper-main"
mv build/bin/server "$install_prefix/bin/whisper-server"
cd ..
}

clone_and_build_llama_cpp() {
local llama_cpp_sha="$1"
local common_flags=("${!2}")

CMAKE_ARGS="${common_flags[*]}" FORCE_CMAKE=1 \
pip install --prefix="$install_prefix" 'llama-cpp-python[server]'
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
git submodule update --init --recursive
git reset --hard "$llama_cpp_sha"
cmake_steps common_flags
cd ..
}

main() {
set -ex

local containerfile="$1"
local llama_cpp_sha="$2"
local whisper_cpp_sha="$3"
local install_prefix
set_install_prefix
local common_flags
configure_common_flags "$containerfile" common_flags

common_flags+=("-DGGML_CCACHE=0" "-DCMAKE_INSTALL_PREFIX=$install_prefix")
dnf_install
clone_and_build_whisper_cpp "$whisper_cpp_sha" "$install_prefix" common_flags[@]
case "$containerfile" in
ramalama)
common_flags+=("-DGGML_KOMPUTE=ON" "-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON")
;;
esac

clone_and_build_llama_cpp "$llama_cpp_sha" common_flags[@]
dnf clean all
rm -rf /var/cache/*dnf* /opt/rocm-*/lib/llvm \
/opt/rocm-*/lib/rocblas/library/*gfx9* llama.cpp whisper.cpp
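
The refactored script passes CMake flag arrays between functions in two ways: configure_common_flags fills the caller's array through a bash nameref (local -n), while cmake_steps and the clone_and_build_* helpers receive an array by name (as name[@]) and expand it with "${!1}". A minimal, self-contained sketch of the two idioms follows; the function and variable names here are illustrative, not part of the PR:

#!/bin/bash
set -euo pipefail

# Fill the caller's array through a nameref (bash 4.3+), as configure_common_flags does.
build_flags() {
  local -n out_ref=$1                 # $1 is the *name* of the caller's array
  out_ref=("-DGGML_NATIVE=OFF" "-DGGML_CCACHE=0")
}

# Receive an array passed as "name[@]" and expand it, as cmake_steps does.
show_flags() {
  local flags=("${!1}")               # indirect expansion of the named array
  printf '%s\n' "${flags[@]}"
}

main() {
  local my_flags
  build_flags my_flags                # pass the array name for the nameref
  show_flags my_flags[@]              # pass name[@] for the "${!1}" expansion
}

main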
2 changes: 1 addition & 1 deletion container-images/vulkan/Containerfile
@@ -3,5 +3,5 @@ FROM quay.io/ramalama/ramalama:latest
COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
/scripts/build_llama_and_whisper.sh "vulkan" "$LLAMA_CPP_SHA" \
"$WHISPER_CPP_SHA" "/usr" "-DGGML_VULKAN=1"
"$WHISPER_CPP_SHA"

1 change: 1 addition & 0 deletions ramalama/cli.py
@@ -215,6 +215,7 @@ def configure_arguments(parser):
)
parser.add_argument("-v", "--version", dest="version", action="store_true", help="show RamaLama version")


def configure_subcommands(parser):
"""Add subcommand parsers to the main argument parser."""
subparsers = parser.add_subparsers(dest="subcommand")
5 changes: 0 additions & 5 deletions ramalama/common.py
@@ -59,11 +59,6 @@ def exec_cmd(args, stderr=True, debug=False):
if debug:
perror("exec_cmd: ", *args)

if not stderr:
# Redirecting stderr to /dev/null
with open(os.devnull, "w") as devnull:
os.dup2(devnull.fileno(), sys.stderr.fileno())

try:
return os.execvp(args[0], args)
except Exception:
39 changes: 12 additions & 27 deletions ramalama/model.py
@@ -233,15 +233,15 @@ def run(self, args):
if not args.container:
raise KeyError("--nocontainer and --name options conflict. --name requires a container.")

prompt = "You are a helpful assistant"
prompt = ""
if args.ARGS:
prompt = " ".join(args.ARGS)

# Build a prompt with the stdin text that prepend the prompt passed as
# an argument to ramalama cli
if not sys.stdin.isatty():
input = sys.stdin.read()
prompt = input + "\n\n" + prompt
inp = sys.stdin.read()
prompt = inp + "\n\n" + prompt

if args.dryrun:
model_path = "/path/to/model"
@@ -254,36 +254,22 @@
if not args.container:
exec_model_path = model_path

exec_args = [
"llama-cli",
"-m",
exec_model_path,
"--in-prefix",
"",
"--in-suffix",
"",
"-c",
f"{args.context}",
"--temp",
f"{args.temp}",
]
exec_args = ["llama-run", "-c", f"{args.context}", "--temp", f"{args.temp}"]

if args.seed:
exec_args += ["--seed", args.seed]

if not args.debug:
exec_args += ["--no-display-prompt"]
exec_args += [
"-p",
prompt,
]

if not args.ARGS and sys.stdin.isatty():
exec_args.append("-cnv")
if args.debug:
exec_args += ["-v"]

if args.gpu:
exec_args.extend(self.gpu_args())

exec_args += [
exec_model_path,
prompt,
]

try:
if self.exec_model_in_container(model_path, exec_args, args):
return
@@ -329,8 +315,7 @@ def serve(self, args):
exec_args += ["--seed", args.seed]

if args.runtime == "vllm":
if not (exec_model_path.endswith(".GGUF") or exec_model_path.endswith(".gguf")):
exec_model_path = os.path.dirname(exec_model_path)
exec_model_path = os.path.dirname(exec_model_path)
exec_args = ["vllm", "serve", "--port", args.port, exec_model_path]
else:
if args.gpu:
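
With this change, ramalama run execs llama-run instead of llama-cli, placing the model path and the prompt as positional arguments after the options. Based on the updated code and the adjusted test expectation below, a dry run such as "ramalama run -c 4096 tiny" now builds a command along these lines (the prompt and the optional flags depend on what was passed):

llama-run -c 4096 --temp 0.8 /path/to/model "<prompt>"
    # "--seed <N>" is appended when --seed is given; "-v" when --debug is given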
15 changes: 15 additions & 0 deletions scripts/replace-shas.sh
@@ -0,0 +1,15 @@
#!/bin/bash

set -euo pipefail

find_files() {
grep -rl "$1_CPP_SHA=" container-images/
}

sed_files() {
xargs sed -i "s/ARG $1_CPP_SHA=.*/ARG $1_CPP_SHA=$2/g"
}

find_files "LLAMA" | sed_files "LLAMA" "$1"
find_files "WHISPER" | sed_files "WHISPER" "$2"

2 changes: 1 addition & 1 deletion test/system/030-run.bats
@@ -34,7 +34,7 @@ load helpers
is "$output" ".*${image} /bin/sh -c" "verify image name"
else
run_ramalama --dryrun run -c 4096 ${model}
is "$output" 'llama-cli -m /path/to/model --in-prefix --in-suffix -c 4096 --temp 0.8 --no-display-prompt -p.*' "dryrun correct"
is "$output" 'llama-run -c 4096 --temp 0.8 /path/to/model.*' "dryrun correct"
is "$output" ".*-c 4096" "verify model name"

run_ramalama 1 run --ctx-size=4096 --name foobar tiny