From c4f91bc1dc97975b2167ee34992b198ae55eaea2 Mon Sep 17 00:00:00 2001
From: Eric Curtin
Date: Wed, 13 Nov 2024 19:20:15 +0000
Subject: [PATCH] Add llama-cpp-python server

Add 'llama-cpp-python' as a choice for the `--runtime` flag, alongside
'llama.cpp' and 'vllm', and change the default runtime from 'llama.cpp'
to 'llama-cpp-python'. With the new default, `ramalama serve` launches
`python3 -m llama_cpp.server` instead of `llama-server`.

Signed-off-by: Eric Curtin
---
 docs/ramalama-serve.1.md   |  2 +-
 ramalama/cli.py            |  6 +++---
 ramalama/model.py          | 43 +++++++++++++++++++++++++-------------
 test/system/040-serve.bats |  8 +++----
 test/system/060-info.bats  |  2 +-
 5 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/docs/ramalama-serve.1.md b/docs/ramalama-serve.1.md
index 4181e5be..12dabbac 100644
--- a/docs/ramalama-serve.1.md
+++ b/docs/ramalama-serve.1.md
@@ -79,7 +79,7 @@ After=local-fs.target
 [Container]
 AddDevice=-/dev/dri
 AddDevice=-/dev/kfd
-Exec=llama-server --port 1234 -m $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
+Exec=python3 -m llama_cpp.server --port 1234 --model $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
 Image=quay.io/ramalama/ramalama:latest
 Mount=type=bind,src=/home/dwalsh/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf,target=/mnt/models/model.file,ro,Z
 ContainerName=MyGraniteServer
diff --git a/ramalama/cli.py b/ramalama/cli.py
index 2d13ec58..77b0fc9a 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -105,7 +105,7 @@ def load_and_merge_config():
     )
 
     ramalama_config['carimage'] = ramalama_config.get('carimage', "registry.access.redhat.com/ubi9-micro:latest")
-    ramalama_config['runtime'] = ramalama_config.get('runtime', 'llama.cpp')
+    ramalama_config['runtime'] = ramalama_config.get('runtime', 'llama-cpp-python')
     ramalama_config['store'] = os.getenv('RAMALAMA_STORE', ramalama_config.get('store', get_store()))
     ramalama_config['transport'] = os.getenv('RAMALAMA_TRANSPORT', ramalama_config.get('transport', "ollama"))
 
@@ -207,8 +207,8 @@ def configure_arguments(parser):
     parser.add_argument(
         "--runtime",
         default=config.get("runtime"),
-        choices=["llama.cpp", "vllm"],
-        help="specify the runtime to use; valid options are 'llama.cpp' and 'vllm'",
+        choices=["llama-cpp-python", "llama.cpp", "vllm"],
+        help="specify the runtime to use; valid options are 'llama-cpp-python', 'llama.cpp' and 'vllm'",
     )
     parser.add_argument(
         "--store",
diff --git a/ramalama/model.py b/ramalama/model.py
index c8d6dcae..c21ef404 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -284,6 +284,19 @@ def run(self, args):
                 raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'")))
             raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))
 
+    def execute_model(self, model_path, exec_args, args):
+        try:
+            if self.exec_model_in_container(model_path, exec_args, args):
+                return
+            if args.dryrun:
+                dry_run(exec_args)
+                return
+            exec_cmd(exec_args, debug=args.debug)
+        except FileNotFoundError as e:
+            if in_container():
+                raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'")))
+            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))
+
     def serve(self, args):
         if hasattr(args, "name") and args.name:
             if not args.container and not args.generate:
@@ -300,13 +313,25 @@ def serve(self, args):
         if not args.container and not args.generate:
             exec_model_path = model_path
 
["llama-server", "--port", args.port, "-m", exec_model_path] if args.runtime == "vllm": exec_args = ["vllm", "serve", "--port", args.port, exec_model_path] - else: + elif args.runtime == "llama.cpp": + exec_args = ["llama-server", "--port", args.port, "-m", exec_model_path, "--host", args.host] if args.gpu: exec_args.extend(self.gpu_args()) - exec_args.extend(["--host", args.host]) + + else: + exec_args = [ + "python3", + "-m", + "llama_cpp.server", + "--port", + args.port, + "--model", + exec_model_path, + "--host", + args.host, + ] if args.generate == "quadlet": return self.quadlet(model_path, args, exec_args) @@ -317,17 +342,7 @@ def serve(self, args): if args.generate == "quadlet/kube": return self.quadlet_kube(model_path, args, exec_args) - try: - if self.exec_model_in_container(model_path, exec_args, args): - return - if args.dryrun: - dry_run(exec_args) - return - exec_cmd(exec_args, debug=args.debug) - except FileNotFoundError as e: - if in_container(): - raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'"))) - raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'"))) + self.execute_model(model_path, exec_args, args) def quadlet(self, model, args, exec_args): quadlet = Quadlet(model, args, exec_args) diff --git a/test/system/040-serve.bats b/test/system/040-serve.bats index 5c2b4f3d..c9c77eaa 100644 --- a/test/system/040-serve.bats +++ b/test/system/040-serve.bats @@ -136,7 +136,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name run cat tinyllama.container is "$output" ".*PublishPort=1234" "PublishPort should match" - is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct" + is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct" is "$output" ".*Mount=type=bind,.*tinyllama" "Mount line should be correct" rm tinyllama.container @@ -174,7 +174,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name run cat $name.container is "$output" ".*PublishPort=1234" "PublishPort should match" is "$output" ".*ContainerName=${name}" "Quadlet should have ContainerName field" - is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct" + is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct" is "$output" ".*Mount=type=image,source=${ociimage},destination=/mnt/models,subpath=/models,readwrite=false" "Volume line should be correct" if is_container; then @@ -226,7 +226,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name run cat $name.yaml is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image" - is "$output" ".*command: \[\"llama-server\"\]" "Should command" + is "$output" ".*command: \[\"python3\"\]" "Should command" is "$output" ".*containerPort: 1234" "Should container container port" run_ramalama serve --name=${name} --port 1234 --generate=quadlet/kube ${model} @@ -235,7 +235,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name run cat $name.yaml is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image" - is "$output" ".*command: \[\"llama-server\"\]" "Should command" + is "$output" ".*command: \[\"llama_cpp.server\"\]" "Should command" is "$output" ".*containerPort: 1234" "Should container container port" run cat $name.kube diff --git a/test/system/060-info.bats b/test/system/060-info.bats 
index 44bf3f91..de4d29cc 100644
--- a/test/system/060-info.bats
+++ b/test/system/060-info.bats
@@ -17,7 +17,7 @@ load helpers
 # FIXME Engine (podman|docker|'')
 tests="
 Image   | "quay.io/ramalama/ramalama:latest"
-Runtime | "llama.cpp"
+Runtime | "llama-cpp-python"
 Version | "${version}"
 Store   | \\\("${HOME}/.local/share/ramalama"\\\|"/var/lib/ramalama"\\\)
 "
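
A rough sketch of how runtime selection is expected to look from the
command line after this change. The flag placement and the tinyllama
model name follow the system tests above; treat the invocations as
illustrative rather than exact.

    # llama-cpp-python is now the default; serving execs roughly:
    #   python3 -m llama_cpp.server --port 1234 --model <model> --host <host>
    ramalama serve --port 1234 tinyllama

    # the previous default remains available explicitly; execs roughly:
    #   llama-server --port 1234 -m <model> --host <host>
    ramalama --runtime llama.cpp serve --port 1234 tinyllama

    # vllm is unchanged; execs roughly: vllm serve --port 1234 <model>
    ramalama --runtime vllm serve --port 1234 tinyllama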