diff --git a/.gitignore b/.gitignore
index 299425af..b4a77e20 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,6 @@ ramalama/*.patch
 dist
 .#*
 venv/
+*.container
+*.image
+*.volume
diff --git a/docs/ramalama-serve.1.md b/docs/ramalama-serve.1.md
index 44669698..74a00e31 100644
--- a/docs/ramalama-serve.1.md
+++ b/docs/ramalama-serve.1.md
@@ -17,6 +17,9 @@ For REST API endpoint documentation, see: [https://github.com/ggerganov/llama.cp
 
 ## OPTIONS
 
+#### **--authfile**=*path*
+path of the authentication file for OCI registries
+
 #### **--detach**, **-d**
 Run the container in the background and print the new container ID.
 The default is TRUE. The --nocontainer option forces this option to False.
@@ -40,14 +43,17 @@ Name of the container to run the Model in.
 #### **--port**, **-p**
 port for AI Model server to listen on
 
+#### **--tls-verify**=*true*
+require HTTPS and verify certificates when contacting OCI registries
+
 ## EXAMPLES
 
 ### Run two AI Models at the same time. Notice both are running within Podman Containers.
 ```
-$ ramalama serve -p 8080 --name mymodel ollama://tiny-llm:latest
+$ ramalama serve -d -p 8080 --name mymodel ollama://tiny-llm:latest
 09b0e0d26ed28a8418fb5cd0da641376a08c435063317e89cf8f5336baf35cfa
 
-$ ramalama serve -n example --port 8081 oci://quay.io/mmortari/gguf-py-example/v1/example.gguf
+$ ramalama serve -d -n example --port 8081 oci://quay.io/mmortari/gguf-py-example/v1/example.gguf
 3f64927f11a5da5ded7048b226fbe1362ee399021f5e8058c73949a677b6ac9c
 
 $ podman ps
@@ -56,11 +62,12 @@ CONTAINER ID  IMAGE                             COMMAND               CREATED
 3f64927f11a5  quay.io/ramalama/ramalama:latest  /usr/bin/ramalama...  17 seconds ago  Up 17 seconds  0.0.0.0:8082->8082/tcp  ramalama_YMPQvJxN97
 ```
 
-### Generate a quadlet for running the AI Model service
+### Generate a quadlet service from a HuggingFace granite Model
 ```
-$ ramalama serve --name MyGraniteServer --generate=quadlet granite > $HOME/.config/containers/systemd/MyGraniteServer.container
-$ cat $HOME/.config/containers/systemd/MyGraniteServer.container
+$ ramalama serve --name MyGraniteServer --generate=quadlet granite
+Generating quadlet file: MyGraniteServer.container
+$ cat MyGraniteServer.container
 [Unit]
 Description=RamaLama granite AI Model Service
 After=local-fs.target
 
@@ -74,26 +81,67 @@ Volume=/home/dwalsh/.local/share/ramalama/models/huggingface/instructlab/granite
 ContainerName=MyGraniteServer
 PublishPort=8080
 
-[Install]
-# Start by default on boot
-WantedBy=multi-user.target default.target
+$ mv MyGraniteServer.container $HOME/.config/containers/systemd/
 $ systemctl --user daemon-reload
 $ systemctl start --user MyGraniteServer
 $ systemctl status --user MyGraniteServer
 ● MyGraniteServer.service - RamaLama granite AI Model Service
      Loaded: loaded (/home/dwalsh/.config/containers/systemd/MyGraniteServer.container; generated)
     Drop-In: /usr/lib/systemd/user/service.d
-             └─10-timeout-abort.conf
+	     └─10-timeout-abort.conf
      Active: active (running) since Fri 2024-09-27 06:54:17 EDT; 3min 3s ago
    Main PID: 3706287 (conmon)
       Tasks: 20 (limit: 76808)
      Memory: 1.0G (peak: 1.0G)
+	...
 $ podman ps
 CONTAINER ID  IMAGE                             COMMAND               CREATED         STATUS         PORTS                    NAMES
 7bb35b97a0fe  quay.io/ramalama/ramalama:latest  llama-server --po...  3 minutes ago   Up 3 minutes   0.0.0.0:43869->8080/tcp  MyGraniteServer
 ```
+
+### Generate a quadlet service from a tiny OCI Model
+```
+$ ramalama --runtime=vllm serve --name tiny --generate=quadlet oci://quay.io/rhatdan/tiny:latest
+Downloading quay.io/rhatdan/tiny:latest...
+Trying to pull quay.io/rhatdan/tiny:latest...
+Getting image source signatures
+Copying blob 65ba8d40e14a skipped: already exists
+Copying blob e942a1bf9187 skipped: already exists
+Copying config d8e0b28ee6 done   |
+Writing manifest to image destination
+Generating quadlet file: tiny.container
+Generating quadlet file: tiny.image
+Generating quadlet file: tiny.volume
+
+$ cat tiny.container
+[Unit]
+Description=RamaLama /run/model/model.file AI Model Service
+After=local-fs.target
+
+[Container]
+AddDevice=-/dev/dri
+AddDevice=-/dev/kfd
+Exec=vllm serve --port 8080 /run/model/model.file
+Image=quay.io/ramalama/ramalama:latest
+Mount=type=volume,source=tiny.volume,dest=/mnt/models,ro
+ContainerName=tiny
+PublishPort=8080
+
+[Install]
+# Start by default on boot
+WantedBy=multi-user.target default.target
+
+$ cat tiny.volume
+[Volume]
+Driver=image
+Image=tiny.image
+
+$ cat tiny.image
+[Image]
+Image=quay.io/rhatdan/tiny:latest
+```
+
 ### Generate a kubernetes YAML file named tini
 ```
 $ ramalama serve --name tini --generate kube tiny
diff --git a/install.sh b/install.sh
index 41381a2d..cf2a49ae 100755
--- a/install.sh
+++ b/install.sh
@@ -80,7 +80,7 @@ setup_ramalama() {
   install -m755 "$to_file" "$ramalama_bin"
 
   local python_files=("cli.py" "huggingface.py" "model.py" "ollama.py" "common.py" "__init__.py" \
-                      "oci.py" "version.py" "shortnames.py" "toml_parser.py")
+                      "quadlet.py" "oci.py" "version.py" "shortnames.py" "toml_parser.py")
 
   for i in "${python_files[@]}"; do
     url="${host}/containers/ramalama/${branch}/ramalama/${i}"
diff --git a/ramalama/cli.py b/ramalama/cli.py
index 3bfbd240..4045f00a 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -605,9 +605,16 @@ def run_cli(args):
 
 def serve_parser(subparsers):
     parser = subparsers.add_parser("serve", help="serve REST API on specified AI Model")
+    parser.add_argument("--authfile", help="path of the authentication file")
     parser.add_argument("-d", "--detach", action="store_true", dest="detach", help="run the container in detached mode")
     parser.add_argument("-n", "--name", dest="name", help="name of container in which the Model will be run")
    parser.add_argument("-p", "--port", default="8080", help="port for AI Model server to listen on")
+    parser.add_argument(
+        "--tls-verify",
+        dest="tlsverify",
+        default=True,
+        help="require HTTPS and verify certificates when contacting registries",
+    )
     parser.add_argument(
         "--generate",
         choices=["quadlet", "kube"],
diff --git a/ramalama/model.py b/ramalama/model.py
index f8b75ccb..61193d38 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -12,6 +12,7 @@
     run_cmd,
 )
 from ramalama.version import version
+from ramalama.quadlet import Quadlet
 
 
 file_not_found = """\
@@ -310,37 +311,8 @@ def serve(self, args):
             raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))
 
     def quadlet(self, model, args, exec_args):
-        port_string = ""
-        if hasattr(args, "port"):
-            port_string = f"PublishPort={args.port}"
-
-        name_string = ""
-        if hasattr(args, "name") and args.name:
-            name_string = f"ContainerName={args.name}"
-
-        if hasattr(args, "MODEL"):
-            model = args.MODEL
-
-        print(
-            f"""
-[Unit]
-Description=RamaLama {model} AI Model Service
-After=local-fs.target
-
-[Container]
-AddDevice=-/dev/dri
-AddDevice=-/dev/kfd
-Exec={" ".join(exec_args)}
-Image={default_image()}
-Volume={model}:/mnt/models:ro,z
-{name_string}
-{port_string}
-
-[Install]
-# Start by default on boot
-WantedBy=multi-user.target default.target
-"""
-        )
+        quadlet = Quadlet(model, args, exec_args)
+        quadlet.gen_container()
 
     def _gen_ports(self, args):
         if not hasattr(args, "port"):
diff --git a/ramalama/oci.py b/ramalama/oci.py
index 31d8aa48..756ad347 100644
--- a/ramalama/oci.py
+++ b/ramalama/oci.py
@@ -153,8 +153,14 @@ def pull(self, args):
         print(f"Downloading {self.model}...")
         if args.engine:
             try:
-                run_cmd([args.engine, "pull", self.model], debug=args.debug)
-                return "/mnt/models/model.file"
+                conman_args = [args.engine, "pull"]
+                if str(args.tlsverify).lower() == "false":
+                    conman_args.extend([f"--tls-verify={args.tlsverify}"])
+                if args.authfile:
+                    conman_args.extend([f"--authfile={args.authfile}"])
+                conman_args.extend([self.model])
+                run_cmd(conman_args, debug=args.debug)
+                return "/run/model/model.file"
             except subprocess.CalledProcessError:
                 pass
         return self._pull_omlmd(args)
diff --git a/ramalama/quadlet.py b/ramalama/quadlet.py
new file mode 100644
index 00000000..a613fba9
--- /dev/null
+++ b/ramalama/quadlet.py
@@ -0,0 +1,82 @@
+import os
+
+from ramalama.common import default_image
+
+
+class Quadlet:
+    def __init__(self, model, args, exec_args):
+        self.ai_image = model
+        if hasattr(args, "MODEL"):
+            self.ai_image = args.MODEL
+        self.ai_image = self.ai_image.removeprefix("oci://")
+        if args.name:
+            self.name = args.name
+        else:
+            self.name = os.path.basename(self.ai_image)
+
+        self.model = model.removeprefix("oci://")
+        self.args = args
+        self.exec_args = exec_args
+
+    def gen_container(self):
+        port_string = ""
+        if hasattr(self.args, "port"):
+            port_string = f"PublishPort={self.args.port}"
+
+        name_string = ""
+        if hasattr(self.args, "name") and self.args.name:
+            name_string = f"ContainerName={self.args.name}"
+
+        outfile = self.name + ".container"
+        print(f"Generating quadlet file: {outfile}")
+        volume = self.gen_volume()
+        with open(outfile, 'w') as c:
+            c.write(
+                f"""\
+[Unit]
+Description=RamaLama {self.model} AI Model Service
+After=local-fs.target
+
+[Container]
+AddDevice=-/dev/dri
+AddDevice=-/dev/kfd
+Exec={" ".join(self.exec_args)}
+Image={default_image()}
+{volume}
+{name_string}
+{port_string}
+
+[Install]
+# Start by default on boot
+WantedBy=multi-user.target default.target
+"""
+            )
+
+    def gen_volume(self):
+        if os.path.exists(self.model):
+            return f"Volume={self.model}:/mnt/models/model.file:ro,Z"
+
+        outfile = self.name + ".volume"
+
+        self.gen_image()
+        print(f"Generating quadlet file: {outfile}")
+        with open(outfile, 'w') as c:
+            c.write(
+                f"""\
+[Volume]
+Driver=image
+Image={self.name}.image
+"""
+            )
+        return f"Mount=type=volume,source={self.name}.volume,dest=/mnt/models,ro"
+
+    def gen_image(self):
+        outfile = self.name + ".image"
+        print(f"Generating quadlet file: {outfile}")
+        with open(outfile, 'w') as c:
+            c.write(
+                f"""\
+[Image]
+Image={self.ai_image}
+"""
+            )
diff --git a/setup.py b/setup.py
index 3caf1d81..be15b1e7 100644
--- a/setup.py
+++ b/setup.py
@@ -62,8 +62,8 @@ def find_package_modules(self, package, package_dir):
 
 
 setuptools.setup(
-    name = "ramalama",
-    version = "0.0.20",
+    name="ramalama",
+    version="0.0.20",
     packages=find_packages(),
     cmdclass={"build_py": build_py},
     scripts=["bin/ramalama"],
diff --git a/test/system/040-serve.bats b/test/system/040-serve.bats
index aa766da7..88983469 100644
--- a/test/system/040-serve.bats
+++ b/test/system/040-serve.bats
@@ -1,6 +1,8 @@
 #!/usr/bin/env bats
 
 load helpers
+load helpers.registry
+load setup_suite
 
 verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name"
 
@@ -117,14 +119,59 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name"
     model=tiny
     name=c_$(safename)
     run_ramalama pull ${model}
-    run_ramalama serve --name=${name} --port 1234 --generate=quadlet ${model}
+    run_ramalama serve --port 1234 --generate=quadlet ${model}
+    is "$output" "Generating quadlet file: tinyllama.container" "generate tinyllama.container"
+
+    run cat tinyllama.container
     is "$output" ".*PublishPort=1234" "PublishPort should match"
-    is "$output" ".*Name=${name}" "Quadlet should have name field"
     is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
-    run_ramalama 2 serve --name=${name} --port 1234 --generate=bogus ${model}
+    is "$output" ".*Volume=.*ollama/tinyllama" "Volume line should be correct"
+
+    rm tinyllama.container
+    run_ramalama 2 serve --name=${name} --port 1234 --generate=bogus tiny
     is "$output" ".*error: argument --generate: invalid choice: 'bogus' (choose from 'quadlet', 'kube')" "Should fail"
 }
 
+@test "ramalama serve --generate=quadlet with OCI" {
+    skip_if_darwin
+    skip_if_docker
+    local registry=localhost:${PODMAN_LOGIN_REGISTRY_PORT}
+    local authfile=$RAMALAMA_TMPDIR/authfile.json
+
+    name=c_$(safename)
+    start_registry
+    run_ramalama login --authfile=$authfile \
+        --tls-verify=false \
+        --username ${PODMAN_LOGIN_USER} \
+        --password ${PODMAN_LOGIN_PASS} \
+        oci://$registry
+    run_ramalama pull tiny
+    run_ramalama push --authfile=$authfile --tls-verify=false tiny oci://$registry/tiny
+    run_ramalama serve --authfile=$authfile --tls-verify=false --name=${name} --port 1234 --generate=quadlet oci://$registry/tiny
+    is "$output" ".*Generating quadlet file: ${name}.container" "generate .container file"
+    is "$output" ".*Generating quadlet file: ${name}.volume" "generate .volume file"
+    is "$output" ".*Generating quadlet file: ${name}.image" "generate .image file"
+
+    run cat $name.container
+    is "$output" ".*PublishPort=1234" "PublishPort should match"
+    is "$output" ".*ContainerName=${name}" "Quadlet should have ContainerName field"
+    is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
+    is "$output" ".*Mount=type=volume,source=${name}.volume,dest=/mnt/models,ro" "Volume line should be correct"
+
+    run cat $name.volume
+    is "$output" ".*Driver=image" "Driver Image"
+    is "$output" ".*Image=$name.image" "Image should exist"
+
+    run cat $name.image
+    is "$output" ".*Image=$registry/tiny" "Image should match"
+
+    rm $name.container
+    rm $name.volume
+    rm $name.image
+    stop_registry
+}
+
+
 @test "ramalama serve --generate=kube" {
     model=tiny
     name=c_$(safename)
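
For anyone trying the change locally, the generated OCI quadlet files are consumed the same way as the granite example in ramalama-serve.1.md above. This is a minimal sketch, assuming the three files were produced in the current directory by the `ramalama serve --name tiny --generate=quadlet oci://...` example and that the per-user quadlet directory is `$HOME/.config/containers/systemd`:

```
# Install the generated quadlet files where the user systemd generator finds them
$ mv tiny.container tiny.volume tiny.image $HOME/.config/containers/systemd/
# Quadlet turns tiny.container into a tiny.service unit on reload
$ systemctl --user daemon-reload
$ systemctl --user start tiny.service
$ systemctl --user status tiny.service
```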