From 1c641e6aac5c18b964e7b32d9dbbb4bf5301d0d7 Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Thu, 13 Jun 2024 00:41:52 +0100
Subject: [PATCH 1/4] =?UTF-8?q?`build`:=20rename=20main=20=E2=86=92=20llam?=
 =?UTF-8?q?a-cli,=20server=20=E2=86=92=20llama-server,=20llava-cli=20?=
 =?UTF-8?q?=E2=86=92=20llama-llava-cli,=20etc...=20(#7809)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* `main`/`server`: rename to `llama` / `llama-server` for consistency w/ homebrew

* server: update refs -> llama-server

gitignore llama-server

* server: simplify nix package

* main: update refs -> llama

fix examples/main ref

* main/server: fix targets

* update more names

* Update build.yml

* rm accidentally checked in bins

* update straggling refs

* Update .gitignore

* Update server-llm.sh

* main: target name -> llama-cli

* Prefix all example bins w/ llama-

* fix main refs

* rename {main->llama}-cmake-pkg binary

* prefix more cmake targets w/ llama-

* add/fix gbnf-validator subfolder to cmake

* sort cmake example subdirs

* rm bin files

* fix llama-lookup-* Makefile rules

* gitignore /llama-*

* rename Dockerfiles

* rename llama|main -> llama-cli; consistent RPM bin prefixes

* fix some missing -cli suffixes

* rename dockerfile w/ llama-cli

* rename(make): llama-baby-llama

* update dockerfile refs

* more llama-cli(.exe)

* fix test-eval-callback

* rename: llama-cli-cmake-pkg(.exe)

* address gbnf-validator unused fread warning (switched to C++ / ifstream)

* add two missing llama- prefixes

* Updating docs for eval-callback binary to use new `llama-` prefix.

* Updating a few lingering doc references for rename of main to llama-cli

* Updating `run-with-preset.py` to use new binary names.

Updating docs around `perplexity` binary rename.

* Updating documentation references for lookup-merge and export-lora

* Updating two small `main` references missed earlier in the finetune docs.

* Update apps.nix

* update grammar/README.md w/ new llama-* names

* update llama-rpc-server bin name + doc

* Revert "update llama-rpc-server bin name + doc"

This reverts commit e474ef1df481fd8936cd7d098e3065d7de378930.
* add hot topic notice to README.md * Update README.md * Update README.md * rename gguf-split & quantize bins refs in **/tests.sh --------- Co-authored-by: HanClinto --- .devops/cloud-v-pipeline | 2 +- ...a.Dockerfile => llama-cli-cuda.Dockerfile} | 6 +- ....Dockerfile => llama-cli-intel.Dockerfile} | 6 +- ...m.Dockerfile => llama-cli-rocm.Dockerfile} | 4 +- ...Dockerfile => llama-cli-vulkan.Dockerfile} | 6 +- .../{main.Dockerfile => llama-cli.Dockerfile} | 6 +- .devops/llama-cpp-clblast.srpm.spec | 14 +- .devops/llama-cpp-cuda.srpm.spec | 14 +- .devops/llama-cpp.srpm.spec | 14 +- ...ockerfile => llama-server-cuda.Dockerfile} | 6 +- ...ckerfile => llama-server-intel.Dockerfile} | 6 +- ...ockerfile => llama-server-rocm.Dockerfile} | 4 +- ...kerfile => llama-server-vulkan.Dockerfile} | 6 +- ...ver.Dockerfile => llama-server.Dockerfile} | 6 +- .devops/nix/apps.nix | 6 +- .devops/nix/package.nix | 4 +- .devops/tools.sh | 10 +- .dockerignore | 4 +- .github/ISSUE_TEMPLATE/01-bug-low.yml | 2 +- .github/ISSUE_TEMPLATE/02-bug-medium.yml | 2 +- .github/ISSUE_TEMPLATE/03-bug-high.yml | 2 +- .github/ISSUE_TEMPLATE/04-bug-critical.yml | 2 +- .github/workflows/bench.yml | 2 +- .github/workflows/build.yml | 10 +- .github/workflows/docker.yml | 16 +- .github/workflows/server.yml | 4 +- .gitignore | 43 +-- Makefile | 134 ++++++---- README-sycl.md | 18 +- README.md | 33 +-- ci/run.sh | 248 +++++++++--------- docs/HOWTO-add-model.md | 2 +- docs/token_generation_performance_tips.md | 4 +- examples/CMakeLists.txt | 39 +-- examples/Miku.sh | 2 +- examples/baby-llama/CMakeLists.txt | 2 +- examples/base-translate.sh | 2 +- examples/batched-bench/CMakeLists.txt | 2 +- examples/batched-bench/README.md | 8 +- examples/batched.swift/Makefile | 6 +- examples/batched.swift/Package.swift | 4 +- examples/batched.swift/README.md | 2 +- examples/batched/CMakeLists.txt | 2 +- examples/batched/README.md | 2 +- examples/benchmark/CMakeLists.txt | 2 +- examples/chat-13B.sh | 2 +- examples/chat-persistent.sh | 10 +- examples/chat-vicuna.sh | 2 +- examples/chat.sh | 2 +- .../convert-llama2c-to-ggml/CMakeLists.txt | 2 +- examples/convert-llama2c-to-ggml/README.md | 6 +- examples/embedding/CMakeLists.txt | 2 +- examples/embedding/README.md | 4 +- examples/eval-callback/CMakeLists.txt | 4 +- examples/eval-callback/README.md | 2 +- examples/export-lora/CMakeLists.txt | 2 +- examples/export-lora/README.md | 4 +- examples/finetune/CMakeLists.txt | 2 +- examples/finetune/README.md | 12 +- examples/finetune/finetune.sh | 2 +- examples/gbnf-validator/CMakeLists.txt | 4 +- examples/gbnf-validator/gbnf-validator.cpp | 36 ++- examples/gguf-split/CMakeLists.txt | 2 +- examples/gguf-split/tests.sh | 4 +- examples/gguf/CMakeLists.txt | 2 +- examples/gritlm/CMakeLists.txt | 2 +- examples/gritlm/README.md | 2 +- examples/imatrix/CMakeLists.txt | 2 +- examples/imatrix/README.md | 6 +- examples/infill/CMakeLists.txt | 2 +- examples/infill/README.md | 2 +- examples/jeopardy/jeopardy.sh | 2 +- examples/json-schema-pydantic-example.py | 2 +- examples/json_schema_to_grammar.py | 2 +- examples/llama-bench/README.md | 2 +- examples/llava/CMakeLists.txt | 11 +- examples/llava/MobileVLM-README.md | 18 +- examples/llava/README.md | 10 +- examples/llava/android/adb_run.sh | 2 +- examples/lookahead/CMakeLists.txt | 2 +- examples/lookup/CMakeLists.txt | 8 +- examples/lookup/lookup-merge.cpp | 8 +- examples/main-cmake-pkg/CMakeLists.txt | 8 +- examples/main-cmake-pkg/README.md | 4 +- examples/main/CMakeLists.txt | 2 +- examples/main/README.md | 24 +- 
examples/parallel/CMakeLists.txt | 2 +- examples/passkey/CMakeLists.txt | 2 +- examples/passkey/README.md | 2 +- examples/perplexity/CMakeLists.txt | 2 +- examples/perplexity/perplexity.cpp | 2 +- examples/quantize-stats/CMakeLists.txt | 2 +- examples/quantize/CMakeLists.txt | 2 +- examples/quantize/tests.sh | 6 +- examples/reason-act.sh | 2 +- examples/retrieval/CMakeLists.txt | 2 +- examples/retrieval/README.md | 2 +- examples/rpc/README.md | 2 +- examples/save-load-state/CMakeLists.txt | 2 +- examples/server-llama2-13B.sh | 2 +- examples/server/CMakeLists.txt | 2 +- examples/server/README.md | 22 +- examples/server/bench/README.md | 2 +- examples/server/bench/bench.py | 2 +- examples/server/public_simplechat/readme.md | 4 +- examples/server/tests/README.md | 8 +- examples/server/tests/features/steps/steps.py | 4 +- examples/simple/CMakeLists.txt | 2 +- examples/speculative/CMakeLists.txt | 2 +- examples/sycl/CMakeLists.txt | 2 +- examples/sycl/README.md | 6 +- examples/sycl/run-llama2.sh | 8 +- examples/tokenize/CMakeLists.txt | 2 +- .../train-text-from-scratch/CMakeLists.txt | 2 +- examples/train-text-from-scratch/README.md | 4 +- flake.nix | 2 +- grammars/README.md | 14 +- pocs/vdot/CMakeLists.txt | 4 +- scripts/get-hellaswag.sh | 2 +- scripts/get-wikitext-103.sh | 2 +- scripts/get-wikitext-2.sh | 2 +- scripts/get-winogrande.sh | 2 +- scripts/hf.sh | 6 +- scripts/pod-llama.sh | 56 ++-- scripts/qnt-all.sh | 2 +- scripts/run-all-ppl.sh | 2 +- scripts/run-with-preset.py | 16 +- scripts/server-llm.sh | 8 +- 128 files changed, 584 insertions(+), 584 deletions(-) rename .devops/{main-cuda.Dockerfile => llama-cli-cuda.Dockerfile} (88%) rename .devops/{main-intel.Dockerfile => llama-cli-intel.Dockerfile} (78%) rename .devops/{main-rocm.Dockerfile => llama-cli-rocm.Dockerfile} (94%) rename .devops/{main-vulkan.Dockerfile => llama-cli-vulkan.Dockerfile} (81%) rename .devops/{main.Dockerfile => llama-cli.Dockerfile} (72%) rename .devops/{server-cuda.Dockerfile => llama-server-cuda.Dockerfile} (88%) rename .devops/{server-intel.Dockerfile => llama-server-intel.Dockerfile} (80%) rename .devops/{server-rocm.Dockerfile => llama-server-rocm.Dockerfile} (94%) rename .devops/{server-vulkan.Dockerfile => llama-server-vulkan.Dockerfile} (82%) rename .devops/{server.Dockerfile => llama-server.Dockerfile} (74%) diff --git a/.devops/cloud-v-pipeline b/.devops/cloud-v-pipeline index f3a4944f8a419..af8c0cea6155c 100644 --- a/.devops/cloud-v-pipeline +++ b/.devops/cloud-v-pipeline @@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto stage('Running llama.cpp'){ sh'''#!/bin/bash module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc - qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64 + qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64 cat llama_log.txt # Printing results ''' } diff --git a/.devops/main-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile similarity index 88% rename from .devops/main-cuda.Dockerfile rename to .devops/llama-cli-cuda.Dockerfile index 2aec4a85dbfe2..d5ce538f61554 100644 --- a/.devops/main-cuda.Dockerfile +++ b/.devops/llama-cli-cuda.Dockerfile @@ -23,13 +23,13 @@ ENV 
CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} # Enable CUDA ENV LLAMA_CUDA=1 -RUN make -j$(nproc) main +RUN make -j$(nproc) llama-cli FROM ${BASE_CUDA_RUN_CONTAINER} as runtime RUN apt-get update && \ apt-get install -y libgomp1 -COPY --from=build /app/main /main +COPY --from=build /app/llama-cli /llama-cli -ENTRYPOINT [ "/main" ] +ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/main-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile similarity index 78% rename from .devops/main-intel.Dockerfile rename to .devops/llama-cli-intel.Dockerfile index b7992f47b8f92..6789e17afcc6e 100644 --- a/.devops/main-intel.Dockerfile +++ b/.devops/llama-cli-intel.Dockerfile @@ -15,12 +15,12 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ fi && \ cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ - cmake --build build --config Release --target main + cmake --build build --config Release --target llama-cli FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime -COPY --from=build /app/build/bin/main /main +COPY --from=build /app/build/bin/llama-cli /llama-cli ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/main" ] +ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/main-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile similarity index 94% rename from .devops/main-rocm.Dockerfile rename to .devops/llama-cli-rocm.Dockerfile index dcaeb3e727ae9..7e8a6f0fa208c 100644 --- a/.devops/main-rocm.Dockerfile +++ b/.devops/llama-cli-rocm.Dockerfile @@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ -RUN make -j$(nproc) main +RUN make -j$(nproc) llama-cli -ENTRYPOINT [ "/app/main" ] +ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/main-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile similarity index 81% rename from .devops/main-vulkan.Dockerfile rename to .devops/llama-cli-vulkan.Dockerfile index 1bdb528035cd4..7a0abe71f1587 100644 --- a/.devops/main-vulkan.Dockerfile +++ b/.devops/llama-cli-vulkan.Dockerfile @@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key WORKDIR /app COPY . . RUN cmake -B build -DLLAMA_VULKAN=1 && \ - cmake --build build --config Release --target main + cmake --build build --config Release --target llama-cli # Clean up WORKDIR / -RUN cp /app/build/bin/main /main && \ +RUN cp /app/build/bin/llama-cli /llama-cli && \ rm -rf /app ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/main" ] +ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/main.Dockerfile b/.devops/llama-cli.Dockerfile similarity index 72% rename from .devops/main.Dockerfile rename to .devops/llama-cli.Dockerfile index d2514c4ba5755..38382bfc9add2 100644 --- a/.devops/main.Dockerfile +++ b/.devops/llama-cli.Dockerfile @@ -9,15 +9,15 @@ WORKDIR /app COPY . . 
-RUN make -j$(nproc) main +RUN make -j$(nproc) llama-cli FROM ubuntu:$UBUNTU_VERSION as runtime RUN apt-get update && \ apt-get install -y libgomp1 -COPY --from=build /app/main /main +COPY --from=build /app/llama-cli /llama-cli ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/main" ] +ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cpp-clblast.srpm.spec b/.devops/llama-cpp-clblast.srpm.spec index 774f63ddd5c4e..013952191127f 100644 --- a/.devops/llama-cpp-clblast.srpm.spec +++ b/.devops/llama-cpp-clblast.srpm.spec @@ -36,9 +36,9 @@ make -j LLAMA_CLBLAST=1 %install mkdir -p %{buildroot}%{_bindir}/ -cp -p main %{buildroot}%{_bindir}/llamaclblast -cp -p server %{buildroot}%{_bindir}/llamaclblastserver -cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple +cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli +cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server +cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple mkdir -p %{buildroot}/usr/lib/systemd/system %{__cat} < %{buildroot}/usr/lib/systemd/system/llamaclblast.service @@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t [Service] Type=simple EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS +ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -67,9 +67,9 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llamaclblast -%{_bindir}/llamaclblastserver -%{_bindir}/llamaclblastsimple +%{_bindir}/llama-clblast-cli +%{_bindir}/llama-clblast-server +%{_bindir}/llama-clblast-simple /usr/lib/systemd/system/llamaclblast.service %config /etc/sysconfig/llama diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec index ba9cb7cbb824f..cbdf4362629d3 100644 --- a/.devops/llama-cpp-cuda.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -36,9 +36,9 @@ make -j LLAMA_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ -cp -p main %{buildroot}%{_bindir}/llamacppcuda -cp -p server %{buildroot}%{_bindir}/llamacppcudaserver -cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple +cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli +cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server +cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple mkdir -p %{buildroot}/usr/lib/systemd/system %{__cat} < %{buildroot}/usr/lib/systemd/system/llamacuda.service @@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t [Service] Type=simple EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS +ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -67,9 +67,9 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llamacppcuda -%{_bindir}/llamacppcudaserver -%{_bindir}/llamacppcudasimple +%{_bindir}/llama-cuda-cli +%{_bindir}/llama-cuda-server +%{_bindir}/llama-cuda-simple /usr/lib/systemd/system/llamacuda.service %config /etc/sysconfig/llama diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec index 1d9e4f425b43a..4d5560089816c 100644 --- a/.devops/llama-cpp.srpm.spec +++ b/.devops/llama-cpp.srpm.spec @@ -38,9 +38,9 @@ make -j %install mkdir -p %{buildroot}%{_bindir}/ -cp -p main %{buildroot}%{_bindir}/llama -cp -p server %{buildroot}%{_bindir}/llamaserver -cp -p simple %{buildroot}%{_bindir}/llamasimple +cp -p llama-cli %{buildroot}%{_bindir}/llama-cli +cp -p llama-server %{buildroot}%{_bindir}/llama-server +cp -p 
llama-simple %{buildroot}%{_bindir}/llama-simple mkdir -p %{buildroot}/usr/lib/systemd/system %{__cat} < %{buildroot}/usr/lib/systemd/system/llama.service @@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t [Service] Type=simple EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llamaserver $LLAMA_ARGS +ExecStart=/usr/bin/llama-server $LLAMA_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -69,9 +69,9 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llama -%{_bindir}/llamaserver -%{_bindir}/llamasimple +%{_bindir}/llama-cli +%{_bindir}/llama-server +%{_bindir}/llama-simple /usr/lib/systemd/system/llama.service %config /etc/sysconfig/llama diff --git a/.devops/server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile similarity index 88% rename from .devops/server-cuda.Dockerfile rename to .devops/llama-server-cuda.Dockerfile index 4e9747b823196..0010ffd4c5465 100644 --- a/.devops/server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1 # Enable cURL ENV LLAMA_CURL=1 -RUN make -j$(nproc) server +RUN make -j$(nproc) llama-server FROM ${BASE_CUDA_RUN_CONTAINER} as runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 -COPY --from=build /app/server /server +COPY --from=build /app/llama-server /llama-server -ENTRYPOINT [ "/server" ] +ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile similarity index 80% rename from .devops/server-intel.Dockerfile rename to .devops/llama-server-intel.Dockerfile index c5adcb6da0408..cec43645233d1 100644 --- a/.devops/server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -15,15 +15,15 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ fi && \ cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ - cmake --build build --config Release --target server + cmake --build build --config Release --target llama-server FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev -COPY --from=build /app/build/bin/server /server +COPY --from=build /app/build/bin/llama-server /llama-server ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/server" ] +ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile similarity index 94% rename from .devops/server-rocm.Dockerfile rename to .devops/llama-server-rocm.Dockerfile index a6b76dee8494f..f88cf20e5b981 100644 --- a/.devops/server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -45,6 +45,6 @@ ENV LLAMA_CURL=1 RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev -RUN make -j$(nproc) +RUN make -j$(nproc) llama-server -ENTRYPOINT [ "/app/server" ] +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile similarity index 82% rename from .devops/server-vulkan.Dockerfile rename to .devops/llama-server-vulkan.Dockerfile index 6e757e171efee..b0fa0b8e656b5 100644 --- a/.devops/server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -19,13 +19,13 @@ RUN apt-get update && \ WORKDIR /app COPY . . 
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \ - cmake --build build --config Release --target server + cmake --build build --config Release --target llama-server # Clean up WORKDIR / -RUN cp /app/build/bin/server /server && \ +RUN cp /app/build/bin/llama-server /llama-server && \ rm -rf /app ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/server" ] +ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/server.Dockerfile b/.devops/llama-server.Dockerfile similarity index 74% rename from .devops/server.Dockerfile rename to .devops/llama-server.Dockerfile index bee63b966d5c0..aa93369bebebe 100644 --- a/.devops/server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -11,15 +11,15 @@ COPY . . ENV LLAMA_CURL=1 -RUN make -j$(nproc) server +RUN make -j$(nproc) llama-server FROM ubuntu:$UBUNTU_VERSION as runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 -COPY --from=build /app/server /server +COPY --from=build /app/llama-server /llama-server ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/server" ] +ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix index b8a12cc0a0463..897fce4d324c1 100644 --- a/.devops/nix/apps.nix +++ b/.devops/nix/apps.nix @@ -6,11 +6,11 @@ let inherit (config.packages) default; binaries = [ - "llama" + "llama-cli" "llama-embedding" "llama-server" - "quantize" - "train-text-from-scratch" + "llama-quantize" + "llama-train-text-from-scratch" ]; mkApp = name: { type = "app"; diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index e8d5b0bd92d2c..87bb3a20f2a28 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -243,8 +243,6 @@ effectiveStdenv.mkDerivation ( # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, # if they haven't been added yet. postInstall = '' - mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix} - mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix} mkdir -p $out/include cp $src/llama.h $out/include/ ''; @@ -294,7 +292,7 @@ effectiveStdenv.mkDerivation ( license = lib.licenses.mit; # Accommodates `nix run` and `lib.getExe` - mainProgram = "llama"; + mainProgram = "llama-cli"; # These people might respond, on the best effort basis, if you ping them # in case of Nix-specific regressions or for reviewing Nix-specific PRs. diff --git a/.devops/tools.sh b/.devops/tools.sh index 97424c3aa746a..335382f69ee1f 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -10,11 +10,11 @@ shift if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then python3 ./convert-hf-to-gguf.py "$@" elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then - ./quantize "$@" + ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then - ./main "$@" + ./llama-cli "$@" elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then - ./finetune "$@" + ./llama-finetune "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -22,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Skip model quantization, it already exists: ${i/f16/q4_0}" else echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." 
- ./quantize "$i" "${i/f16/q4_0}" q4_0 + ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 fi done elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then - ./server "$@" + ./llama-server "$@" else echo "Unknown command: $arg1" echo "Available commands: " diff --git a/.dockerignore b/.dockerignore index 633bbc3a971c1..8916e2a660732 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,8 +12,8 @@ build*/ models/* -/main -/quantize +/llama-cli +/llama-quantize arm_neon.h compile_commands.json diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml index bfb9d9a0692c4..54785854f776e 100644 --- a/.github/ISSUE_TEMPLATE/01-bug-low.yml +++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? (use `--version` to get a version string) placeholder: | - $./main --version + $./llama-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml index e8297eea03551..a6285c6f05bac 100644 --- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml +++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? (use `--version` to get a version string) placeholder: | - $./main --version + $./llama-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml index 3c9d50d169720..ff816b93769c3 100644 --- a/.github/ISSUE_TEMPLATE/03-bug-high.yml +++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? (use `--version` to get a version string) placeholder: | - $./main --version + $./llama-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml index d089d5fa10cfc..7af42a80b3b93 100644 --- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml +++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml @@ -24,7 +24,7 @@ body: label: Name and Version description: Which executable and which version of our software are you running? 
(use `--version` to get a version string) placeholder: | - $./main --version + $./llama-cli --version version: 2999 (42b4109e) built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu validations: diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index de0d994c80e9d..88ab4844ef123 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -119,7 +119,7 @@ jobs: -DLLAMA_FATAL_WARNINGS=OFF \ -DLLAMA_ALL_WARNINGS=OFF \ -DCMAKE_BUILD_TYPE=Release; - cmake --build build --config Release -j $(nproc) --target server + cmake --build build --config Release -j $(nproc) --target llama-server - name: Download the dataset id: download_dataset diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3c04cfc295fb8..81ce770cce3a1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -103,12 +103,10 @@ jobs: id: cmake_build run: | sysctl -a - mkdir build - cd build # Metal is disabled due to intermittent failures with Github runners not having a GPU: # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON .. - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test id: cmake_test @@ -241,8 +239,8 @@ jobs: wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin echo "Fetch llama2c model" wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf + ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - name: Determine tag name id: tag diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 9b03d19bc77c6..6244b481210a0 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -30,20 +30,20 @@ jobs: strategy: matrix: config: - - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" } # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I # have disabled them for now until the reason why # is understood. 
- - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } steps: - name: Check out the repo uses: actions/checkout@v4 diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 0d16ef5f110f9..1fee9ac281943 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -96,7 +96,7 @@ jobs: -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - name: Tests id: server_integration_tests @@ -136,7 +136,7 @@ jobs: id: cmake_build run: | cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server + cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - name: Python setup id: setup_python diff --git a/.gitignore b/.gitignore index 5223c6963b155..5296594952c4a 100644 --- a/.gitignore +++ b/.gitignore @@ -46,48 +46,9 @@ models/* models-mnt /Pipfile -/baby-llama -/beam-search -/benchmark-matmult -/convert-llama2c-to-ggml -/embd-input-test -/embedding -/eval-callback -/gguf -/gguf-llama-simple -/gguf-split -/gritlm -/imatrix -/infill /libllama.so -/llama-bench -/llava-cli -/lookahead -/lookup -/lookup-create -/lookup-merge -/lookup-stats -/main -/metal -/passkey -/perplexity -/q8dot -/quantize -/quantize-stats -/result -/save-load-state -/server -/simple -/batched -/batched-bench -/export-lora -/finetune -/retrieval -/speculative -/parallel -/train-text-from-scratch -/tokenize -/vdot +/llama-* +llama-batched-swift /common/build-info.cpp arm_neon.h compile_commands.json diff --git a/Makefile b/Makefile index 895c62f84def0..a4cab1bb208bc 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,44 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ - main quantize quantize-stats perplexity imatrix embedding vdot q8dot 
train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ - retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o + libllava.a \ + llama-baby-llama \ + llama-batched \ + llama-batched-bench \ + llama-bench \ + llama-benchmark-matmult \ + llama-cli \ + llama-convert-llama2c-to-ggml \ + llama-embedding \ + llama-eval-callback \ + llama-export-lora \ + llama-finetune \ + llama-gbnf-validator \ + llama-gguf \ + llama-gguf-split \ + llama-gritlm \ + llama-imatrix \ + llama-infill \ + llama-llava-cli \ + llama-lookahead \ + llama-lookup \ + llama-lookup-create \ + llama-lookup-merge \ + llama-lookup-stats \ + llama-parallel \ + llama-passkey \ + llama-perplexity \ + llama-q8dot \ + llama-quantize \ + llama-quantize-stats \ + llama-retrieval \ + llama-save-load-state \ + llama-server \ + llama-simple \ + llama-speculative \ + llama-tokenize \ + llama-train-text-from-scratch \ + llama-vdot \ + tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ @@ -777,7 +813,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS) ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS) clean: - rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) rm -vrf ggml-cuda/*.o rm -vrf ggml-cuda/template-instances/*.o find examples pocs -type f -name "*.o" -delete @@ -793,62 +829,62 @@ clean: # Helper function that replaces .c, .cpp, and .cu file endings with .o: GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) -main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +llama-cli: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo - @echo '==== Run ./main -h for help. ====' + @echo '==== Run ./llama-cli -h for help. 
====' @echo -infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) +llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o 
$(COMMON_DEPS) $(OBJS) +llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) @@ -861,23 +897,23 @@ examples/server/%.hpp: examples/server/public/% Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ -gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) +llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) +llama-convert-llama2c-to-ggml: 
examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -888,55 +924,61 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual -llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) -baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) +llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, 
$<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) +llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS) - $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS) - $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS) - -passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + +llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -962,20 +1004,20 @@ build-info.o: common/build-info.cpp tests: $(TEST_TARGETS) -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) +llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -run-benchmark-matmult: benchmark-matmult +run-benchmark-matmult: llama-benchmark-matmult ./$@ .PHONY: run-benchmark-matmult swift -vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) +llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) +llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff 
--git a/README-sycl.md b/README-sycl.md index 62b38135c01c0..93b623daf6a1a 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -77,7 +77,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, *Notes:* - **Memory** - - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`. + - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`. - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU. @@ -99,14 +99,14 @@ The docker build option is currently limited to *intel GPU* targets. ### Build image ```sh # Using FP16 -docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile . +docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile . ``` *Notes*: To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command. -You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative. +You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative. ### Run container @@ -275,7 +275,7 @@ source /opt/intel/oneapi/setvars.sh Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ```sh -./build/bin/ls-sycl-device +./build/bin/llama-ls-sycl-device ``` A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: ``` @@ -313,7 +313,7 @@ Examples: - Use device 0: ```sh -ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 +ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ``` or run by script: @@ -324,7 +324,7 @@ or run by script: - Use multiple devices: ```sh -ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer +ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ``` Otherwise, you can run the script: @@ -427,7 +427,7 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in *Notes:* -- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`. +- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`. ### III. 
Run the inference @@ -488,13 +488,13 @@ Examples: - Use device 0: ``` -build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0 +build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0 ``` - Use multiple devices: ``` -build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer +build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer ``` Otherwise, run the following wrapper script: diff --git a/README.md b/README.md index 8c065aace09be..d1c6190ddfd7b 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ +> [!IMPORTANT] +[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809) + ### Recent API changes - [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807 @@ -217,7 +220,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: Here is a typical run using LLaMA v2 13B on M2 Ultra: ``` -$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e +$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e I llama.cpp build info: I UNAME_S: Darwin I UNAME_P: arm @@ -555,7 +558,7 @@ Building the program with BLAS support may lead to some performance improvements ```sh # Build the image - docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile . + docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile . # Then, use it: docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 @@ -586,7 +589,7 @@ Building the program with BLAS support may lead to some performance improvements cmake -B build -DLLAMA_VULKAN=1 cmake --build build --config Release # Test the output binary (with "-ngl 33" to offload all layers to GPU) - ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 + ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 # You should see in the output, ggml_vulkan detected your GPU. 
For example: # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 @@ -623,17 +626,17 @@ python3 convert-hf-to-gguf.py models/mymodel/ python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe # quantize the model to 4-bits (using Q4_K_M method) -./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M +./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M # update the gguf filetype to current version if older version is now unsupported -./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY +./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY ``` ### Run the quantized model ```bash # start inference on a gguf model -./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 +./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. @@ -708,7 +711,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread #### How to run 1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip -2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw` +2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw` 3. Output: ``` perplexity : calculating perplexity over 655 chunks @@ -732,16 +735,16 @@ Here is an example of a few-shot interaction, invoked with the command ./examples/chat-13B.sh # custom arguments using a 13B model -./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt +./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt ``` -Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program. +Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program. ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) ### Persistent Interaction -The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. +The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. 
To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. ```bash # Start a new chat @@ -763,7 +766,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ `llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: ```bash -./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' +./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' ``` The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). @@ -842,7 +845,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho Now, you can start chatting: ``` $cd /data/data/com.termux/files/home/bin -$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml +$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml ``` Here's a demo of an interactive session running on Pixel 5 phone: @@ -909,8 +912,8 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia ```bash docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . -docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile . -docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile . +docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile . +docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile . ``` You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. 
@@ -960,7 +963,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m ### Docs -- [main](./examples/main/README.md) +- [main (cli)](./examples/main/README.md) - [server](./examples/server/README.md) - [jeopardy](./examples/jeopardy/README.md) - [BLIS](./docs/BLIS.md) diff --git a/ci/run.sh b/ci/run.sh index 3fc5f48b2e2af..291c44f47b86d 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -303,47 +303,47 @@ function gg_run_open_llama_7b_v2 { wiki_test="${path_wiki}/wiki.test.raw" - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/quantize ${model_f16} ${model_q2_k} q2_k - ./bin/quantize ${model_f16} ${model_q3_k} q3_k - ./bin/quantize ${model_f16} ${model_q4_k} q4_k - ./bin/quantize ${model_f16} ${model_q5_k} q5_k - ./bin/quantize ${model_f16} ${model_q6_k} q6_k - - (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 
--chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - - (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time 
./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -437,45 +437,45 @@ function gg_run_pythia_1_4b { wiki_test_60="${path_wiki}/wiki.test-60.raw" - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/quantize ${model_f16} ${model_q2_k} q2_k - ./bin/quantize ${model_f16} ${model_q3_k} q3_k - ./bin/quantize ${model_f16} ${model_q4_k} q4_k - ./bin/quantize ${model_f16} ${model_q5_k} q5_k - ./bin/quantize ${model_f16} ${model_q6_k} q6_k - - (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 
2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - - (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli --model 
${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -569,47 +569,47 @@ function gg_run_pythia_2_8b { wiki_test="${path_wiki}/wiki.test.raw" - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/quantize 
${model_f16} ${model_q4_1} q4_1 - ./bin/quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/quantize ${model_f16} ${model_q2_k} q2_k - ./bin/quantize ${model_f16} ${model_q3_k} q3_k - ./bin/quantize ${model_f16} ${model_q4_k} q4_k - ./bin/quantize ${model_f16} ${model_q5_k} q5_k - ./bin/quantize ${model_f16} ${model_q6_k} q6_k - - (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log 
- (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - - (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time 
./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -693,10 +693,10 @@ function gg_run_embd_bge_small { model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 - (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log set +e } diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md index 1381242485960..3eec077ea7bd9 100644 --- a/docs/HOWTO-add-model.md +++ b/docs/HOWTO-add-model.md @@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. -Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback). +Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback). 
## GGUF specification diff --git a/docs/token_generation_performance_tips.md b/docs/token_generation_performance_tips.md index 3c43431471243..c0840cad57fb3 100644 --- a/docs/token_generation_performance_tips.md +++ b/docs/token_generation_performance_tips.md @@ -3,7 +3,7 @@ ## Verifying that the model is running on the GPU with CUDA Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: ```shell -./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " +./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ``` When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: @@ -27,7 +27,7 @@ RAM: 32GB Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML) -Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` +Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` Result: diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 53002f8e1ce76..d6ce35f4cc4e9 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,42 +13,43 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() add_subdirectory(baby-llama) - add_subdirectory(batched) add_subdirectory(batched-bench) + add_subdirectory(batched) add_subdirectory(benchmark) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) + add_subdirectory(export-lora) add_subdirectory(finetune) - add_subdirectory(gritlm) + add_subdirectory(gbnf-validator) add_subdirectory(gguf-split) + add_subdirectory(gguf) + add_subdirectory(gritlm) + add_subdirectory(imatrix) add_subdirectory(infill) add_subdirectory(llama-bench) add_subdirectory(llava) - if (LLAMA_SYCL) - add_subdirectory(sycl) - endif() + add_subdirectory(lookahead) + add_subdirectory(lookup) add_subdirectory(main) - add_subdirectory(tokenize) add_subdirectory(parallel) + add_subdirectory(passkey) add_subdirectory(perplexity) - add_subdirectory(quantize) add_subdirectory(quantize-stats) + add_subdirectory(quantize) add_subdirectory(retrieval) + if (LLAMA_RPC) + add_subdirectory(rpc) + endif() + if (LLAMA_BUILD_SERVER) + add_subdirectory(server) + endif() + if (LLAMA_SYCL) + add_subdirectory(sycl) + endif() add_subdirectory(save-load-state) add_subdirectory(simple) - add_subdirectory(passkey) add_subdirectory(speculative) - add_subdirectory(lookahead) - add_subdirectory(lookup) - add_subdirectory(gguf) + add_subdirectory(tokenize) add_subdirectory(train-text-from-scratch) - add_subdirectory(imatrix) - if (LLAMA_BUILD_SERVER) - add_subdirectory(server) - endif() - add_subdirectory(export-lora) - if (LLAMA_RPC) - add_subdirectory(rpc) - endif() endif() diff --git a/examples/Miku.sh b/examples/Miku.sh index b9174b4e6e126..0f6c8c8787107 100755 --- a/examples/Miku.sh +++ b/examples/Miku.sh @@ -22,7 +22,7 @@ if [ -n 
"$N_THREAD" ]; then GEN_OPTIONS+=(--threads "$N_THREAD") fi -./main "${GEN_OPTIONS[@]}" \ +./llama-cli "${GEN_OPTIONS[@]}" \ --model "$MODEL" \ --in-prefix " " \ --in-suffix "${AI_NAME}:" \ diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt index 7b70227a525e1..71b82105c8863 100644 --- a/examples/baby-llama/CMakeLists.txt +++ b/examples/baby-llama/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET baby-llama) +set(TARGET llama-baby-llama) add_executable(${TARGET} baby-llama.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/base-translate.sh b/examples/base-translate.sh index 00dedd0df7dde..103a52f55e6db 100755 --- a/examples/base-translate.sh +++ b/examples/base-translate.sh @@ -58,4 +58,4 @@ echo "$2 model=$1 # generate the most likely continuation until the string "===" is found -./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs +./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt index 40a032c514d11..959acaeeebc38 100644 --- a/examples/batched-bench/CMakeLists.txt +++ b/examples/batched-bench/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET batched-bench) +set(TARGET llama-batched-bench) add_executable(${TARGET} batched-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md index fa4baf6403e9e..4a07fe6bbf268 100644 --- a/examples/batched-bench/README.md +++ b/examples/batched-bench/README.md @@ -10,16 +10,16 @@ There are 2 modes of operation: - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. 
`N_KV = PP + B*TG`) ```bash -./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] +./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared -./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 +./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared -./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps +./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps # custom set of batches -./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 +./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 ``` ## Sample results diff --git a/examples/batched.swift/Makefile b/examples/batched.swift/Makefile index 2afb24fb85a1a..1f9156e583fdd 100755 --- a/examples/batched.swift/Makefile +++ b/examples/batched.swift/Makefile @@ -1,6 +1,6 @@ .PHONY: build build: - xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build - rm -f ./batched_swift - ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift + xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build + rm -f ./llama-batched-swift + ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift diff --git a/examples/batched.swift/Package.swift b/examples/batched.swift/Package.swift index 826491defd863..7e8afd0843c5b 100644 --- a/examples/batched.swift/Package.swift +++ b/examples/batched.swift/Package.swift @@ -4,7 +4,7 @@ import PackageDescription let package = Package( - name: "batched_swift", + name: "llama-batched-swift", platforms: [.macOS(.v12)], dependencies: [ .package(name: "llama", path: "../../"), @@ -13,7 +13,7 @@ let package = Package( // Targets are the basic building blocks of a package, defining a module or a test suite. // Targets can depend on other targets in this package and products from dependencies. .executableTarget( - name: "batched_swift", + name: "llama-batched-swift", dependencies: ["llama"], path: "Sources", linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")] diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md index 4c2721fe85b00..7f2e2fcdcf4a7 100644 --- a/examples/batched.swift/README.md +++ b/examples/batched.swift/README.md @@ -1,4 +1,4 @@ This is a swift clone of `examples/batched`. 
$ `make` -$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]` +$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]` diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt index 6aa178d4d5911..77e33343b6673 100644 --- a/examples/batched/CMakeLists.txt +++ b/examples/batched/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET batched) +set(TARGET llama-batched) add_executable(${TARGET} batched.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/batched/README.md b/examples/batched/README.md index ed204c3088882..6013aab01fddc 100644 --- a/examples/batched/README.md +++ b/examples/batched/README.md @@ -3,7 +3,7 @@ The example demonstrates batched generation from a given prompt ```bash -./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 +./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 ... diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 2bb47bab5a868..34a58cc02abaf 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET benchmark) +set(TARGET llama-bench-matmult) add_executable(${TARGET} benchmark-matmult.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh index 35c089d57d253..1828903c31670 100755 --- a/examples/chat-13B.sh +++ b/examples/chat-13B.sh @@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ $PROMPT_TEMPLATE > $PROMPT_FILE # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS -./main $GEN_OPTIONS \ +./llama-cli $GEN_OPTIONS \ --model "$MODEL" \ --threads "$N_THREAD" \ --n_predict "$N_PREDICTS" \ diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh index 22f5b83d3da06..d9cab9836482e 100755 --- a/examples/chat-persistent.sh +++ b/examples/chat-persistent.sh @@ -62,7 +62,7 @@ fi if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then echo 'Prompt cache does not exist, building...' # Default batch_size to 64 here for better user feedback during initial prompt processing - ./main 2>>"$LOG" \ + ./llama-cli 2>>"$LOG" \ --batch_size 64 \ "${OPTS[@]}" \ --prompt-cache "$PROMPT_CACHE_FILE" \ @@ -109,13 +109,13 @@ while read -e line; do printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE" - ./main 2>>"$LOG" "${OPTS[@]}" \ + ./llama-cli 2>>"$LOG" "${OPTS[@]}" \ --prompt-cache "$CUR_PROMPT_CACHE" \ --prompt-cache-all \ --file "$CUR_PROMPT_FILE" \ --reverse-prompt "${USER_NAME}:" \ --n_predict "$n_predict" | - skip_bytes 1 | # skip BOS token added by ./main + skip_bytes 1 | # skip BOS token added by ./llama-cli tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file skip_bytes "$n_prompt_len_pre" # print generation @@ -133,7 +133,7 @@ while read -e line; do # TODO get both messages in one go if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then - echo >&2 "Couldn't get number of tokens from ./main output!" + echo >&2 "Couldn't get number of tokens from ./llama-cli output!" 
exit 1 fi @@ -144,7 +144,7 @@ while read -e line; do fi # Update cache for next prompt in background, ideally during user input - ./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \ + ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \ --prompt-cache "$NEXT_PROMPT_CACHE" \ --file "$NEXT_PROMPT_FILE" \ --n_predict 1 & diff --git a/examples/chat-vicuna.sh b/examples/chat-vicuna.sh index 8c7b7bef42784..ffdd200849503 100755 --- a/examples/chat-vicuna.sh +++ b/examples/chat-vicuna.sh @@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ $PROMPT_TEMPLATE > $PROMPT_FILE # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS -./bin/main $GEN_OPTIONS \ +./bin/llama-cli $GEN_OPTIONS \ --model "$MODEL" \ --threads "$N_THREAD" \ --n_predict "$N_PREDICTS" \ diff --git a/examples/chat.sh b/examples/chat.sh index d567acecdff11..9f85d1e265d00 100755 --- a/examples/chat.sh +++ b/examples/chat.sh @@ -11,6 +11,6 @@ cd .. # # "--keep 48" is based on the contents of prompts/chat-with-bob.txt # -./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \ +./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \ --repeat_penalty 1.0 --color -i \ -r "User:" -f prompts/chat-with-bob.txt diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt index e262d44f98496..a6790e617217e 100644 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET convert-llama2c-to-ggml) +set(TARGET llama-convert-llama2c-to-ggml) add_executable(${TARGET} convert-llama2c-to-ggml.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index 742dcf7a35b50..5774ac83c32c8 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu After successful compilation, following usage options are available: ``` -usage: ./convert-llama2c-to-ggml [options] +usage: ./llama-convert-llama2c-to-ggml [options] options: -h, --help show this help message and exit @@ -19,10 +19,10 @@ options: An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows: -`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` +`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K). 
Now you can use the model with a command like: -`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` +`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 8ffc33868401f..8256e789ad33a 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET embedding) +set(TARGET llama-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 6929454c5e549..2298ec3e7fcb3 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor ### Unix-based systems (Linux, macOS, etc.): ```bash -./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null +./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null ``` ### Windows: ```powershell -embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null +llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null ``` The above command will output space-separated float values. diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index c56ba780b215f..a48753d38e16e 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -1,9 +1,9 @@ -set(TARGET eval-callback) +set(TARGET llama-eval-callback) add_executable(${TARGET} eval-callback.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md index 66a37e8783a9b..63a57ad6b68e5 100644 --- a/examples/eval-callback/README.md +++ b/examples/eval-callback/README.md @@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data. Usage: ```shell -eval-callback \ +llama-eval-callback \ --hf-repo ggml-org/models \ --hf-file phi-2/ggml-model-q4_0.gguf \ --model phi-2-q4_0.gguf \ diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt index cbbdaec67488d..1cef6e71694e2 100644 --- a/examples/export-lora/CMakeLists.txt +++ b/examples/export-lora/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET export-lora) +set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 0cf3e8e4549bb..1fb17feeca627 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -3,7 +3,7 @@ Apply LORA adapters to base model and export the resulting model. 
``` -usage: export-lora [options] +usage: llama-export-lora [options] options: -h, --help show this help message and exit @@ -17,7 +17,7 @@ options: For example: ```bash -./bin/export-lora \ +./bin/llama-export-lora \ -m open-llama-3b-v2-q8_0.gguf \ -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt index 2b52d21cfb381..64afe6ddc647a 100644 --- a/examples/finetune/CMakeLists.txt +++ b/examples/finetune/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET finetune) +set(TARGET llama-finetune) add_executable(${TARGET} finetune.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/finetune/README.md b/examples/finetune/README.md index 2fafd505e5447..a6ae64983bc0a 100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -7,7 +7,7 @@ Basic usage instructions: wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt # finetune LORA adapter -./bin/finetune \ +./bin/llama-finetune \ --model-base open-llama-3b-v2-q8_0.gguf \ --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \ --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \ @@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s --use-checkpointing # predict -./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin +./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin ``` **Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`). @@ -38,14 +38,14 @@ After 10 more iterations: Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter. llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`. -These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above. +These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above. -In `main` you can also load multiple LORA adapters, which will then be mixed together. +In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together. 
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this: ```bash -./bin/main -m open-llama-3b-v2-q8_0.gguf \ +./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \ --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \ --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin ``` @@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one: ```bash -./bin/main -m open-llama-3b-v2-q8_0.gguf \ +./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \ --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \ --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \ --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh index 079bfa1139d5b..d7f2165e5cb3a 100644 --- a/examples/finetune/finetune.sh +++ b/examples/finetune/finetune.sh @@ -2,7 +2,7 @@ cd `dirname $0` cd ../.. -EXE="./finetune" +EXE="./llama-finetune" if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt index 166e3ad2ae7dd..4edd6ec7394c5 100644 --- a/examples/gbnf-validator/CMakeLists.txt +++ b/examples/gbnf-validator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET gbnf-validator) +set(TARGET llama-gbnf-validator) add_executable(${TARGET} gbnf-validator.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index 091069ffa699c..0406dc3398b8a 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -7,6 +7,8 @@ #include <cstdio> #include <cstdlib> +#include <fstream> +#include <sstream> #include <string> #include <vector> @@ -69,13 +71,14 @@ int main(int argc, char** argv) { return 1; } - fseek(grammar_file, 0, SEEK_END); - size_t grammar_size = ftell(grammar_file); - fseek(grammar_file, 0, SEEK_SET); - - std::string grammar_str(grammar_size, ' '); - fread(&grammar_str[0], 1, grammar_size, grammar_file); - fclose(grammar_file); + std::string grammar_str; + { + std::ifstream grammar_file(grammar_filename); + GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file"); + std::stringstream buffer; + buffer << grammar_file.rdbuf(); + grammar_str = buffer.str(); + } // Parse the GBNF grammar auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); @@ -100,20 +103,15 @@ int main(int argc, char** argv) { grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); // Read the input file - FILE* input_file = fopen(input_filename.c_str(), "r"); - if (!input_file) { - fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str()); - return 1; + std::string input_str; + { + std::ifstream input_file(input_filename); + GGML_ASSERT(input_file.is_open() && "Failed to open input file"); + std::stringstream buffer; + buffer << input_file.rdbuf(); + input_str = buffer.str(); } - fseek(input_file, 0, SEEK_END); - size_t input_size = ftell(input_file); - fseek(input_file, 0, SEEK_SET); -
std::string input_str(input_size, ' '); - fread(&input_str[0], 1, input_size, input_file); - fclose(input_file); - // Validate the input string against the grammar size_t error_pos; std::string error_msg; diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt index 828e624352c8d..f63887da7dfca 100644 --- a/examples/gguf-split/CMakeLists.txt +++ b/examples/gguf-split/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET gguf-split) +set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh index 3bc0fa47110e3..d5a92d6051063 100755 --- a/examples/gguf-split/tests.sh +++ b/examples/gguf-split/tests.sh @@ -18,8 +18,8 @@ fi set -x -SPLIT=$1/gguf-split -MAIN=$1/main +SPLIT=$1/llama-gguf-split +MAIN=$1/llama-cli WORK_PATH=$TMP_DIR/gguf-split ROOT_DIR=$(realpath $(dirname $0)/../../) diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index 6481f087bc997..a9569b411956b 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET gguf) +set(TARGET llama-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt index ac4a5ae7937ea..86dfddca346fe 100644 --- a/examples/gritlm/CMakeLists.txt +++ b/examples/gritlm/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET gritlm) +set(TARGET llama-gritlm) add_executable(${TARGET} gritlm.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md index a3a3c1389ca27..786ba57363def 100644 --- a/examples/gritlm/README.md +++ b/examples/gritlm/README.md @@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou Run the example using the downloaded model: ```console -$ ./gritlm -m models/gritlm-7b_q4_1.gguf +$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt index d688a16209049..d4c8265bdb9d2 100644 --- a/examples/imatrix/CMakeLists.txt +++ b/examples/imatrix/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET imatrix) +set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 866ca9f5682ac..38b36ee5a26fd 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -6,7 +6,7 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/ ## Usage ``` -./imatrix \ +./llama-imatrix \ -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] 
@@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument LLAMA_CUDA=1 make -j # generate importance matrix (imatrix.dat) -./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 +./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 # use the imatrix to perform a Q4_K_M quantization -./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m +./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m ``` diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index e4e8028da09da..9b1aa3b63c920 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET infill) +set(TARGET llama-infill) add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/infill/README.md b/examples/infill/README.md index 6b076c8390abf..74f42d2fc2696 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -42,5 +42,5 @@ scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.ggu ``` ```bash -./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " +./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " ``` diff --git a/examples/jeopardy/jeopardy.sh b/examples/jeopardy/jeopardy.sh index 9bdbc755c13a7..07bcb3b8d78ac 100755 --- a/examples/jeopardy/jeopardy.sh +++ b/examples/jeopardy/jeopardy.sh @@ -21,7 +21,7 @@ counter=1 echo 'Running' while IFS= read -r question do - exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\"" + exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\"" echo $counter echo "Current Question: $question" eval "$exe_cmd" diff --git a/examples/json-schema-pydantic-example.py b/examples/json-schema-pydantic-example.py index 69ebfd4093824..cc64e572bac07 100644 --- a/examples/json-schema-pydantic-example.py +++ b/examples/json-schema-pydantic-example.py @@ -1,5 +1,5 @@ # Usage: -#! ./server -m some-model.gguf & +#! ./llama-server -m some-model.gguf & #! pip install pydantic #! python json-schema-pydantic-example.py diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index ab19e20df658e..b588497b99f90 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -523,7 +523,7 @@ def format_grammar(self): def main(args_in = None): parser = argparse.ArgumentParser( description=''' - Generates a grammar (suitable for use in ./main) that produces JSON conforming to a + Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a given JSON schema. Only a subset of JSON schema features are supported; more may be added in the future. ''', diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index fd95b35f46595..52b0e74d3dbf9 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -1,4 +1,4 @@ -# llama.cpp/example/llama-bench +# llama.cpp/examples/llama-bench Performance testing tool for llama.cpp. 
diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 2985caff8379a..e9fa73acb097b 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -30,8 +30,9 @@ if(TARGET BUILD_INFO) add_dependencies(llava BUILD_INFO) endif() -set(TARGET llava-cli) -add_executable(llava-cli llava-cli.cpp) -install(TARGETS llava-cli RUNTIME) -target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(llava PRIVATE cxx_std_11) +set(TARGET llama-llava-cli) +add_executable(${TARGET} llava-cli.cpp) +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 74f021dec5e17..05a8207e67b88 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown. ## Usage -Build with cmake or run `make llava-cli` to build it. +Build with cmake or run `make llama-llava-cli` to build it. -After building, run: `./llava-cli` to see the usage. For example: +After building, run: `./llama-llava-cli` to see the usage. For example: ```sh -./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ +./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \ --image path/to/an/image.jpg \ -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:" @@ -62,7 +62,7 @@ python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` ```sh -./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s +./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s ``` Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory. 
@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path` ### case 1 **input** ```sh -/data/local/tmp/llava-cli \ +/data/local/tmp/llama-llava-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms ### case 2 **input** ```sh -/data/local/tmp/llava-cli \ +/data/local/tmp/llama-llava-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms #### llava-cli release-b2005 **input** ```sh -/data/local/tmp/llava-cli \ +/data/local/tmp/llama-llava-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 ### case 1 **input** ```sh -./llava-cli \ +./llama-llava-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ --image /data/local/tmp/demo.jpeg \ @@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens ### case 2 **input** ```sh -./llava-cli \ +./llama-llava-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" \ diff --git a/examples/llava/README.md b/examples/llava/README.md index 8d1ae5270e458..f4554de676e4c 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h After API is confirmed, more models will be supported / uploaded. ## Usage -Build with cmake or run `make llava-cli` to build it. +Build with cmake or run `make llama-llava-cli` to build it. -After building, run: `./llava-cli` to see the usage. For example: +After building, run: `./llama-llava-cli` to see the usage. For example: ```sh -./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg +./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg ``` **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. @@ -95,9 +95,9 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown ``` -7) And finally we can run the llava-cli using the 1.6 model version: +7) And finally we can run the llava cli using the 1.6 model version: ```console -./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 +./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 ``` **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh index f73623ae3b129..45ccf8d70d863 100755 --- a/examples/llava/android/adb_run.sh +++ b/examples/llava/android/adb_run.sh @@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant. 
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" program_dir="build_64/bin" -binName="llava-cli" +binName="llama-llava-cli" n_threads=4 diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt index 8827e3f11ecd6..f0ae5cd89244c 100644 --- a/examples/lookahead/CMakeLists.txt +++ b/examples/lookahead/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET lookahead) +set(TARGET llama-lookahead) add_executable(${TARGET} lookahead.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index b91633f63e4ee..ef19fe25e31a3 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -1,22 +1,22 @@ -set(TARGET lookup) +set(TARGET llama-lookup) add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -set(TARGET lookup-create) +set(TARGET llama-lookup-create) add_executable(${TARGET} lookup-create.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -set(TARGET lookup-merge) +set(TARGET llama-lookup-merge) add_executable(${TARGET} lookup-merge.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -set(TARGET lookup-stats) +set(TARGET llama-lookup-stats) add_executable(${TARGET} lookup-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp index 07c93eb8d057b..81e2b04369b90 100644 --- a/examples/lookup/lookup-merge.cpp +++ b/examples/lookup/lookup-merge.cpp @@ -11,14 +11,14 @@ #include #include -static void print_usage() { +static void print_usage(char* argv0) { fprintf(stderr, "Merges multiple lookup cache files into a single one.\n"); - fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n"); + fprintf(stderr, "Usage: %s [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n", argv0); } int main(int argc, char ** argv){ if (argc < 3) { - print_usage(); + print_usage(argv[0]); exit(1); } @@ -27,7 +27,7 @@ int main(int argc, char ** argv){ for (int i = 0; i < argc-1; ++i) { args[i] = argv[i+1]; if (args[i] == "-h" || args[i] == "--help") { - print_usage(); + print_usage(argv[0]); exit(0); } } diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index deb77d588ea9f..a97ded3653f0c 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -1,12 +1,12 @@ cmake_minimum_required(VERSION 3.12) -project("main-cmake-pkg" C CXX) -set(TARGET main-cmake-pkg) +project("llama-cli-cmake-pkg" C CXX) +set(TARGET llama-cli-cmake-pkg) find_package(Llama 0.0.1 REQUIRED) # Bake common functionality in with target. Because applications # using the relocatable Llama package should be outside of the -# source tree, main-cmake-pkg pretends the dependencies are built-in. 
+# source tree, llama-cli-cmake-pkg pretends the dependencies are built-in. set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common") add_library(common OBJECT) file(GLOB _common_files @@ -15,7 +15,7 @@ file(GLOB _common_files ) target_sources(common PRIVATE ${_common_files}) -# If the common project was part of "main-cmake-pkg" the transient +# If the common project was part of "llama-cli-cmake-pkg" the transient # defines would automatically be attached. Because the common func- # tionality is separate, but dependent upon the defines, it must be # explicitly extracted from the "llama" target. diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md index a88e92f23981f..08d83dd08636a 100644 --- a/examples/main-cmake-pkg/README.md +++ b/examples/main-cmake-pkg/README.md @@ -1,6 +1,6 @@ # llama.cpp/example/main-cmake-pkg -This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree. +This program builds [llama-cli](../main) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree. ## Building @@ -20,7 +20,7 @@ cmake --build build --config Release cmake --install build --prefix C:/LlamaCPP ``` -### Build main-cmake-pkg +### Build llama-cli-cmake-pkg ```cmd diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index d532980b76da8..5f6efaa9aa94b 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET main) +set(TARGET llama-cli) add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/main/README.md b/examples/main/README.md index cdc002f151d4a..61e4a42f7e02c 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -1,4 +1,4 @@ -# llama.cpp/example/main +# llama.cpp/examples/main This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. 
@@ -20,13 +20,13 @@ To get started right away, run the following command, making sure to use the cor #### Unix-based systems (Linux, macOS, etc.): ```bash -./main -m models/7B/ggml-model.bin --prompt "Once upon a time" +./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time" ``` #### Windows: ```powershell -main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time" +llama-cli.exe -m models\7B\ggml-model.bin --prompt "Once upon a time" ``` For an interactive experience, try this command: @@ -34,7 +34,7 @@ For an interactive experience, try this command: #### Unix-based systems (Linux, macOS, etc.): ```bash -./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \ +./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \ 'User: Hi AI: Hello. I am an AI chatbot. Would you like to talk? User: Sure! @@ -45,7 +45,7 @@ User:' #### Windows: ```powershell -main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" +llama-cli.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" ``` The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): @@ -53,18 +53,18 @@ The following command generates "infinite" text from a starting prompt (you can #### Unix-based systems (Linux, macOS, etc.): ```bash -./main -m models/7B/ggml-model.bin --ignore-eos -n -1 +./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1 ``` #### Windows: ```powershell -main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 +llama-cli.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 ``` ## Common Options -In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: +In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models: - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). @@ -74,7 +74,7 @@ In this section, we cover the most commonly used options for running the `main` ## Input Prompts -The `main` program provides several ways to interact with the LLaMA models using input prompts: +The `llama-cli` program provides several ways to interact with the LLaMA models using input prompts: - `--prompt PROMPT`: Provide a prompt directly as a command-line option. - `--file FNAME`: Provide a file containing a prompt or multiple prompts. @@ -82,7 +82,7 @@ The `main` program provides several ways to interact with the LLaMA models using ## Interaction -The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`. +The `llama-cli` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. 
The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`. In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing. @@ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag: ```sh -./main -r "User:" --in-prefix " " +./llama-cli -r "User:" --in-prefix " " ``` ### In-Suffix @@ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag: ```sh -./main -r "User:" --in-prefix " " --in-suffix "Assistant:" +./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:" ``` ## Context Management diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt index 319535a6e9054..c13557bac2bac 100644 --- a/examples/parallel/CMakeLists.txt +++ b/examples/parallel/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET parallel) +set(TARGET llama-parallel) add_executable(${TARGET} parallel.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/passkey/CMakeLists.txt b/examples/passkey/CMakeLists.txt index 3161bf3ef9a45..dc467a5d3e411 100644 --- a/examples/passkey/CMakeLists.txt +++ b/examples/passkey/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET passkey) +set(TARGET llama-passkey) add_executable(${TARGET} passkey.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/passkey/README.md b/examples/passkey/README.md index 9e7a119ba3e0b..a48a6283a0b27 100644 --- a/examples/passkey/README.md +++ b/examples/passkey/README.md @@ -8,5 +8,5 @@ See the following PRs for more info: ### Usage ```bash -make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 +make -j && ./llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 ``` diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index 3c76d3221416b..be0f2fd029e67 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET perplexity) +set(TARGET llama-perplexity) add_executable(${TARGET} perplexity.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 0bd78c21a86a1..efde8dfdff47b 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -476,7 +476,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } // Download: 
https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip - // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` + // Run `./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index e31cf5e3809c1..bb986a716883d 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET quantize-stats) +set(TARGET llama-quantize-stats) add_executable(${TARGET} quantize-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 6b977fde86ab2..3ee4eb9719fc4 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET quantize) +set(TARGET llama-quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh index 38e28ffc365ee..24bc970e8632b 100644 --- a/examples/quantize/tests.sh +++ b/examples/quantize/tests.sh @@ -18,9 +18,9 @@ fi set -x -SPLIT=$1/gguf-split -QUANTIZE=$1/quantize -MAIN=$1/main +SPLIT=$1/llama-gguf-split +QUANTIZE=$1/llama-quantize +MAIN=$1/llama-cli WORK_PATH=$TMP_DIR/quantize ROOT_DIR=$(realpath $(dirname $0)/../../) diff --git a/examples/reason-act.sh b/examples/reason-act.sh index 046c48db584bc..06d592799cf12 100755 --- a/examples/reason-act.sh +++ b/examples/reason-act.sh @@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then MODEL="-m $2 " fi -./main $MODEL --color \ +./llama-cli $MODEL --color \ -f ./prompts/reason-act.txt \ -i --interactive-first \ --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \ diff --git a/examples/retrieval/CMakeLists.txt b/examples/retrieval/CMakeLists.txt index eaabae08d5583..66610f3111405 100644 --- a/examples/retrieval/CMakeLists.txt +++ b/examples/retrieval/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET retrieval) +set(TARGET llama-retrieval) add_executable(${TARGET} retrieval.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/retrieval/README.md b/examples/retrieval/README.md index 2b2595c468046..bc5f22e2ff156 100644 --- a/examples/retrieval/README.md +++ b/examples/retrieval/README.md @@ -15,7 +15,7 @@ https://github.com/ggerganov/llama.cpp/pull/6193 `retrieval` example can be tested as follows: ```bash -make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator . +make -j && ./llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator . ``` This chunks and embeds all given files and starts a loop requesting query inputs: diff --git a/examples/rpc/README.md b/examples/rpc/README.md index eeec71a8ee0c2..86544e3fea2c3 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -70,5 +70,5 @@ cmake --build . 
--config Release Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: ```bash -$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 +$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 ``` diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index cc6ed8554a6e3..0fb5e359bc9ad 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET save-load-state) +set(TARGET llama-save-load-state) add_executable(${TARGET} save-load-state.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/server-llama2-13B.sh b/examples/server-llama2-13B.sh index 17fedc2b176f6..4ce79b7fac477 100755 --- a/examples/server-llama2-13B.sh +++ b/examples/server-llama2-13B.sh @@ -16,7 +16,7 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}" # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS -./server $GEN_OPTIONS \ +./llama-server $GEN_OPTIONS \ --model "$MODEL" \ --threads "$N_THREAD" \ --rope-freq-scale 1.0 \ diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index dab709619ae34..8365f9510ceae 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET server) +set(TARGET llama-server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/examples/server/README.md b/examples/server/README.md index ccbdcdbdb2ddb..e7fb0bf64c0e1 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -80,26 +80,26 @@ The project is under active development, and we are [looking for feedback and co ## Build -`server` is built alongside everything else from the root of the project +`llama-server` is built alongside everything else from the root of the project - Using `make`: ```bash - make server + make llama-server ``` - Using `CMake`: ```bash cmake -B build - cmake --build build --config Release -t server + cmake --build build --config Release -t llama-server ``` - Binary is at `./build/bin/server` + Binary is at `./build/bin/llama-server` ## Build with SSL -`server` can also be built with SSL support using OpenSSL 3 +`llama-server` can also be built with SSL support using OpenSSL 3 - Using `make`: @@ -107,14 +107,14 @@ The project is under active development, and we are [looking for feedback and co # NOTE: For non-system openssl, use the following: # CXXFLAGS="-I /path/to/openssl/include" # LDFLAGS="-L /path/to/openssl/lib" - make LLAMA_SERVER_SSL=true server + make LLAMA_SERVER_SSL=true llama-server ``` - Using `CMake`: ```bash cmake -B build -DLLAMA_SERVER_SSL=ON - cmake --build build --config Release -t server + cmake --build build --config Release -t llama-server ``` ## Quick Start @@ -124,13 +124,13 @@ To get started right away, run the following command, making sure to use the cor ### Unix-based systems (Linux, macOS, etc.) 
```bash -./server -m models/7B/ggml-model.gguf -c 2048 +./llama-server -m models/7B/ggml-model.gguf -c 2048 ``` ### Windows ```powershell -server.exe -m models\7B\ggml-model.gguf -c 2048 +llama-server.exe -m models\7B\ggml-model.gguf -c 2048 ``` The above command will start a server that by default listens on `127.0.0.1:8080`. @@ -629,11 +629,11 @@ bash chat.sh ### OAI-like API -The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi +The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi ### API errors -`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi +`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi Example of an error: diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 23a3ec97523ef..0f18ca39651d2 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -99,7 +99,7 @@ The `bench.py` script does several steps: It aims to be used in the CI, but you can run it manually: ```shell -LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \ +LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \ --runner-label local \ --name local \ --branch `git rev-parse --abbrev-ref HEAD` \ diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 86c5de101445c..4fbbb203267f4 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -245,7 +245,7 @@ def start_server(args): def start_server_background(args): # Start the server - server_path = '../../../build/bin/server' + server_path = '../../../build/bin/llama-server' if 'LLAMA_SERVER_BIN_PATH' in os.environ: server_path = os.environ['LLAMA_SERVER_BIN_PATH'] server_args = [ diff --git a/examples/server/public_simplechat/readme.md b/examples/server/public_simplechat/readme.md index 36a46885d71df..2dc1778255256 100644 --- a/examples/server/public_simplechat/readme.md +++ b/examples/server/public_simplechat/readme.md @@ -44,12 +44,12 @@ http module. ### running using examples/server -bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT] +./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT] ### running using python3's server module first run examples/server -* bin/server -m path/model.gguf +* ./llama-server -m path/model.gguf next run this web front end in examples/server/public_simplechat * cd ../examples/server/public_simplechat diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 83c0208f36c09..5e6cb277bc813 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -27,10 +27,8 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`. ```shell cd ../../.. -mkdir build -cd build -cmake -DLLAMA_CURL=ON ../ -cmake --build . --target server +cmake -B build -DLLAMA_CURL=ON +cmake --build build --target llama-server ``` 2. 
Start the test: `./tests.sh` @@ -40,7 +38,7 @@ It's possible to override some scenario steps values with environment variables: | variable | description | |--------------------------|------------------------------------------------------------------------------------------------| | `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` | -| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/server` | +| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` | | `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` | | `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format | | `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` | diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 26d9359d7f3f8..7b5dabb019e75 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1272,9 +1272,9 @@ def context_text(context): def start_server_background(context): if os.name == 'nt': - context.server_path = '../../../build/bin/Release/server.exe' + context.server_path = '../../../build/bin/Release/llama-server.exe' else: - context.server_path = '../../../build/bin/server' + context.server_path = '../../../build/bin/llama-server' if 'LLAMA_SERVER_BIN_PATH' in os.environ: context.server_path = os.environ['LLAMA_SERVER_BIN_PATH'] server_listen_addr = context.server_fqdn diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 7da5ff6f3ac04..070cfbe7ad525 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET simple) +set(TARGET llama-simple) add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/speculative/CMakeLists.txt b/examples/speculative/CMakeLists.txt index 810f3c46ac4aa..aa208e7aaeeb0 100644 --- a/examples/speculative/CMakeLists.txt +++ b/examples/speculative/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET speculative) +set(TARGET llama-speculative) add_executable(${TARGET} speculative.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index 69cf8932eb5c7..e4d5083e6e502 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -2,7 +2,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: MIT -set(TARGET ls-sycl-device) +set(TARGET llama-ls-sycl-device) add_executable(${TARGET} ls-sycl-device.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/sycl/README.md b/examples/sycl/README.md index c589c2d3a489c..0e3acd35b616a 100644 --- a/examples/sycl/README.md +++ b/examples/sycl/README.md @@ -6,9 +6,9 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU. |Tool Name| Function|Status| |-|-|-| -|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support| +|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support| -### ls-sycl-device +### llama-ls-sycl-device List all SYCL devices with ID, compute capability, max work group size, ect. 
@@ -23,7 +23,7 @@ source /opt/intel/oneapi/setvars.sh 3. Execute ``` -./build/bin/ls-sycl-device +./build/bin/llama-ls-sycl-device ``` Check the ID in startup log, like: diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index 7b39a18c0681d..da0e4aaba688c 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -23,15 +23,15 @@ fi if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then echo "use $GGML_SYCL_DEVICE as main GPU" #use signle GPU only - ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none else #use multiple GPUs with same max compute units - ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 fi #use main GPU only -#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none +#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none #use multiple GPUs with same max compute units -#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 +#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 diff --git a/examples/tokenize/CMakeLists.txt b/examples/tokenize/CMakeLists.txt index 5e6654d7e5988..b704dcae18c52 100644 --- a/examples/tokenize/CMakeLists.txt +++ b/examples/tokenize/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET tokenize) +set(TARGET llama-tokenize) add_executable(${TARGET} tokenize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt index 4459516d093d6..9a1d2a35e7731 100644 --- a/examples/train-text-from-scratch/CMakeLists.txt +++ b/examples/train-text-from-scratch/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET train-text-from-scratch) +set(TARGET llama-train-text-from-scratch) add_executable(${TARGET} train-text-from-scratch.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md index 1b3454069e9a3..3abae238036dc 100644 --- a/examples/train-text-from-scratch/README.md +++ b/examples/train-text-from-scratch/README.md @@ -7,7 +7,7 @@ Basic usage instructions: wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt # train -./bin/train-text-from-scratch \ +./bin/llama-train-text-from-scratch \ --vocab-model ../models/ggml-vocab-llama.gguf \ --ctx 64 --embd 256 --head 8 --layer 16 \ --checkpoint-in chk-shakespeare-256x16-LATEST.gguf \ @@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s --no-checkpointing # predict -./bin/main -m ggml-shakespeare-256x16-f32.gguf +./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf ``` Output files will be saved every N iterations (config with `--save-every N`). 
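For context, a minimal sketch of the typical follow-up to the training commands above, using the renamed binaries (the checkpoint and prompt names here are placeholders, not part of this patch):

```bash
# quantize the f32 model produced by llama-train-text-from-scratch, then run it
./llama-quantize ggml-shakespeare-256x16-f32.gguf ggml-shakespeare-256x16-q8_0.gguf q8_0
./llama-cli -m ggml-shakespeare-256x16-q8_0.gguf -p "KING HENRY:"
```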
diff --git a/flake.nix b/flake.nix index 0a52ea52ea9d9..c69637d111784 100644 --- a/flake.nix +++ b/flake.nix @@ -63,7 +63,7 @@ # nix-repl> :lf github:ggerganov/llama.cpp # Added 13 variables. # nix-repl> outputs.apps.x86_64-linux.quantize - # { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/quantize"; type = "app"; } + # { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/llama-quantize"; type = "app"; } # ``` outputs = { self, flake-parts, ... }@inputs: diff --git a/grammars/README.md b/grammars/README.md index 2ec21a4c091cf..2f685eb6de004 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -91,7 +91,7 @@ item ::= [^\n]+ "\n" This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with: ``` -./main -m --grammar-file grammars/some-grammar.gbnf -p 'Some prompt' +./llama-cli -m --grammar-file grammars/some-grammar.gbnf -p 'Some prompt' ``` `llama.cpp` can also convert JSON schemas to grammars either ahead of time or at each request, see below. @@ -110,20 +110,20 @@ While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) ma You can use GBNF grammars: -- In the [server](../examples/server)'s completion endpoints, passed as the `grammar` body field -- In the [main](../examples/main) CLI, passed as the `--grammar` & `--grammar-file` flags -- With the [gbnf-validator](../examples/gbnf-validator) tool, to test them against strings. +- In [llama-server](../examples/server)'s completion endpoints, passed as the `grammar` body field +- In [llama-cli](../examples/main), passed as the `--grammar` & `--grammar-file` flags +- With [llama-gbnf-validator](../examples/gbnf-validator) tool, to test them against strings. ## JSON Schemas → GBNF `llama.cpp` supports converting a subset of https://json-schema.org/ to GBNF grammars: -- In the [server](../examples/server): +- In [llama-server](../examples/server): - For any completion endpoints, passed as the `json_schema` body field - For the `/chat/completions` endpoint, passed inside the `result_format` body field (e.g. `{"type", "json_object", "schema": {"items": {}}}`) -- In the [main](../examples/main) CLI, passed as the `--json` / `-j` flag +- In [llama-cli](../examples/main), passed as the `--json` / `-j` flag - To convert to a grammar ahead of time: - - in CLI, with [json_schema_to_grammar.py](../examples/json_schema_to_grammar.py) + - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py) - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI) Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555). 
diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt index fb89a1cd4e833..d5405ad2991d1 100644 --- a/pocs/vdot/CMakeLists.txt +++ b/pocs/vdot/CMakeLists.txt @@ -1,9 +1,9 @@ -set(TARGET vdot) +set(TARGET llama-vdot) add_executable(${TARGET} vdot.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -set(TARGET q8dot) +set(TARGET llama-q8dot) add_executable(${TARGET} q8dot.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/scripts/get-hellaswag.sh b/scripts/get-hellaswag.sh index 121979fe2b069..4e1b1cc15f01a 100755 --- a/scripts/get-hellaswag.sh +++ b/scripts/get-hellaswag.sh @@ -4,7 +4,7 @@ wget https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/hellaswag echo "Usage:" echo "" -echo " ./perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]" +echo " ./llama-perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]" echo "" exit 0 diff --git a/scripts/get-wikitext-103.sh b/scripts/get-wikitext-103.sh index 880dd5cbe2230..9c65fafbcc50b 100755 --- a/scripts/get-wikitext-103.sh +++ b/scripts/get-wikitext-103.sh @@ -4,7 +4,7 @@ wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1. echo "Usage:" echo "" -echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]" +echo " ./llama-perplexity -m model.gguf -f wiki.test.raw [other params]" echo "" exit 0 diff --git a/scripts/get-wikitext-2.sh b/scripts/get-wikitext-2.sh index b01476a46015a..5f3845ef59a9e 100755 --- a/scripts/get-wikitext-2.sh +++ b/scripts/get-wikitext-2.sh @@ -5,7 +5,7 @@ unzip wikitext-2-raw-v1.zip echo "Usage:" echo "" -echo " ./perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]" +echo " ./llama-perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]" echo "" exit 0 diff --git a/scripts/get-winogrande.sh b/scripts/get-winogrande.sh index 5f234468e1643..f1fc0e2d47adb 100755 --- a/scripts/get-winogrande.sh +++ b/scripts/get-winogrande.sh @@ -4,7 +4,7 @@ wget https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp/raw echo "Usage:" echo "" -echo " ./perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]" +echo " ./llama-perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]" echo "" exit 0 diff --git a/scripts/hf.sh b/scripts/hf.sh index 58f83d6fe92a5..85c2c4d9a952e 100755 --- a/scripts/hf.sh +++ b/scripts/hf.sh @@ -3,9 +3,9 @@ # Shortcut for downloading HF models # # Usage: -# ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) -# ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) -# ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./llama-cli -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./llama-cli -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./llama-cli -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) # # all logs go to stderr diff 
--git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 5dabbf60e6fc8..6ba499a2a2521 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -77,9 +77,9 @@ if [ "$1" -eq "1" ]; then python3 examples/convert-legacy-llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k - ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "2" ]; then @@ -92,9 +92,9 @@ if [ "$1" -eq "2" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "3" ]; then @@ -107,9 +107,9 @@ if [ "$1" -eq "3" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "4" ]; then @@ -122,9 +122,9 @@ if [ "$1" -eq "4" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "5" ]; then @@ -137,9 +137,9 @@ if 
[ "$1" -eq "5" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "6" ]; then @@ -152,9 +152,9 @@ if [ "$1" -eq "6" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "7" ]; then @@ -167,9 +167,9 @@ if [ "$1" -eq "7" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "1" ]; then @@ -181,22 +181,22 @@ if [ "$1" -eq "1" ]; then ../scripts/get-wikitext-2.sh unzip wikitext-2-raw-v1.zip - make -j && ./bin/perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32 + make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32 # batched cd /workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 + LLAMA_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 # batched-bench cd 
/workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 + LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 # parallel cd /workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb + LLAMA_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb fi @@ -204,10 +204,10 @@ fi #if [ "$1" -eq "7" ]; then # cd /workspace/llama.cpp # -# LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 +# LLAMA_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 #fi # more benches -#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 -#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 diff --git a/scripts/qnt-all.sh b/scripts/qnt-all.sh index b4c2a159e2bf5..bc43738a2f498 100755 --- a/scripts/qnt-all.sh +++ b/scripts/qnt-all.sh @@ -26,5 +26,5 @@ set -e mkdir -p ${out} for q in ${qnt[@]}; do - time ./bin/quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt + time ./bin/llama-quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt done diff --git a/scripts/run-all-ppl.sh b/scripts/run-all-ppl.sh index e04d61d7fe091..e15f74f1b666d 100755 --- a/scripts/run-all-ppl.sh +++ b/scripts/run-all-ppl.sh @@ -26,5 +26,5 @@ out="../tmp/results-${model}" mkdir -p ${out} for q in ${qnt[@]}; do - time ./bin/perplexity -m ../models/${model}/ggml-model-f16.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt + time ./bin/llama-perplexity -m ../models/${model}/ggml-model-f16.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt done diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py index 0d7219113ec3f..ee21eab371418 100755 --- a/scripts/run-with-preset.py +++ b/scripts/run-with-preset.py @@ -10,7 +10,7 @@ logger = logging.getLogger("run-with-preset") -CLI_ARGS_MAIN_PERPLEXITY = [ +CLI_ARGS_LLAMA_CLI_PERPLEXITY = [ "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", @@ -29,7 +29,7 @@ "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose" ] -CLI_ARGS_SERVER = [ 
+CLI_ARGS_LLAMA_SERVER = [ "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base", "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q", "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split", @@ -37,7 +37,7 @@ ] description = """Run llama.cpp binaries with presets from YAML file(s). -To specify which binary should be run, specify the "binary" property (main, perplexity, llama-bench, and server are supported). +To specify which binary should be run, specify the "binary" property (llama-cli, llama-perplexity, llama-bench, and llama-server are supported). To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument. Formatting considerations: @@ -77,19 +77,19 @@ props = {prop.replace("_", "-"): val for prop, val in props.items()} -binary = props.pop("binary", "main") +binary = props.pop("binary", "llama-cli") if known_args.binary: binary = known_args.binary if os.path.exists(f"./{binary}"): binary = f"./{binary}" -if binary.lower().endswith("main") or binary.lower().endswith("perplexity"): - cli_args = CLI_ARGS_MAIN_PERPLEXITY +if binary.lower().endswith("llama-cli") or binary.lower().endswith("llama-perplexity"): + cli_args = CLI_ARGS_LLAMA_CLI_PERPLEXITY elif binary.lower().endswith("llama-bench"): cli_args = CLI_ARGS_LLAMA_BENCH -elif binary.lower().endswith("server"): - cli_args = CLI_ARGS_SERVER +elif binary.lower().endswith("llama-server"): + cli_args = CLI_ARGS_LLAMA_SERVER else: logger.error(f"Unknown binary: {binary}") sys.exit(1) diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh index b3715e204b691..199232440449b 100644 --- a/scripts/server-llm.sh +++ b/scripts/server-llm.sh @@ -380,13 +380,13 @@ fi if [[ "$backend" == "cuda" ]]; then printf "[+] Building with CUDA backend\n" - LLAMA_CUDA=1 make -j server $log + LLAMA_CUDA=1 make -j llama-server $log elif [[ "$backend" == "cpu" ]]; then printf "[+] Building with CPU backend\n" - make -j server $log + make -j llama-server $log elif [[ "$backend" == "metal" ]]; then printf "[+] Building with Metal backend\n" - make -j server $log + make -j llama-server $log else printf "[-] Unknown backend: %s\n" "$backend" exit 1 @@ -413,6 +413,6 @@ if [[ $verbose -eq 1 ]]; then args="$args --verbose" fi -./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args +./llama-server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args exit 0 From f578b86b2123d0f92afbaa98a031df4d4464e582 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 13 Jun 2024 03:11:35 +0200 Subject: [PATCH 2/4] move BLAS to a separate backend (#6210) * move BLAS to a separate backend * rename GGML_USE_OPENBLAS to GGML_USE_BLAS * alloc : reuse same buffer when the same buffer type if used multiple times * set number of threads automatically for openblas and blis * sched : print assignments when GGML_SCHED_DEBUG env variable is set * sched : allow ops with weights on an incompatible buffer type This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment. 
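For readers skimming the commit message, a minimal, illustrative sketch (not part of the patch) of how a host program could wire the new BLAS backend into the scheduler. Only ggml_backend_blas_init, ggml_backend_blas_set_n_threads and the GGML_SCHED_DEBUG environment variable come from this change; the helper name, the thread count, the graph size and the exact ggml_backend_sched_new call are assumptions for illustration.

    // sketch only: create the BLAS backend added by this patch and register it
    // ahead of the CPU backend, which stays last so it remains the fallback
    #include "ggml-backend.h"
    #include "ggml-blas.h"

    static ggml_backend_sched_t make_sched(void) {        // hypothetical helper
        ggml_backend_t blas = ggml_backend_blas_init();
        ggml_backend_blas_set_n_threads(blas, 8);          // for OpenBLAS/BLIS this also sets the BLAS thread count
        ggml_backend_t cpu  = ggml_backend_cpu_init();

        ggml_backend_t backends[] = { blas, cpu };
        // bufts = NULL -> default buffer type per backend; graph size and
        // parallel flag are placeholder values, assuming the current
        // ggml_backend_sched_new(backends, bufts, n_backends, graph_size, parallel) signature
        return ggml_backend_sched_new(backends, NULL, 2, 2048, false);
    }

    // running any binary with GGML_SCHED_DEBUG=1 set prints the per-node
    // backend assignments computed by the scheduler

Design note: the BLAS backend reports the CPU buffer type as its default, so it shares host buffers with the CPU backend; that is what the ggml-alloc change below exploits when the same buffer type appears multiple times in the scheduler.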
--------- Co-authored-by: Georgi Gerganov --- CMakeLists.txt | 23 +- Makefile | 27 +- examples/llama-bench/llama-bench.cpp | 1 + ggml-alloc.c | 98 ++++++-- ggml-backend-impl.h | 28 ++- ggml-backend.c | 242 +++++++++++++----- ggml-backend.h | 6 +- ggml-blas.cpp | 363 +++++++++++++++++++++++++++ ggml-blas.h | 23 ++ ggml-cuda.cu | 44 ++-- ggml-kompute.cpp | 13 +- ggml-metal.m | 15 +- ggml-rpc.cpp | 21 +- ggml-sycl.cpp | 28 +-- ggml-vulkan.cpp | 26 +- ggml.c | 205 ++------------- llama.cpp | 37 ++- 17 files changed, 821 insertions(+), 379 deletions(-) create mode 100644 ggml-blas.cpp create mode 100644 ggml-blas.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e280f87d9dd1..08481334f18f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,8 +39,12 @@ endif() if (APPLE) set(LLAMA_METAL_DEFAULT ON) + set(LLAMA_BLAS_DEFAULT ON) + set(LLAMA_BLAS_VENDOR_DEFAULT "Apple") else() set(LLAMA_METAL_DEFAULT OFF) + set(LLAMA_BLAS_DEFAULT OFF) + set(LLAMA_BLAS_VENDOR_DEFAULT "Generic") endif() set(LLAMA_LLAMAFILE_DEFAULT ON) @@ -91,9 +95,10 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) -option(LLAMA_BLAS "llama: use BLAS" OFF) +option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT}) +set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING + "llama: BLAS library vendor") option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) -set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUDA "llama: use CUDA" OFF) option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) @@ -311,9 +316,9 @@ if (LLAMA_BLAS) if (LLAMA_STATIC) set(BLA_STATIC ON) endif() - if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) - set(BLA_SIZEOF_INTEGER 8) - endif() + #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) + # set(BLA_SIZEOF_INTEGER 8) + #endif() set(BLA_VENDOR ${LLAMA_BLAS_VENDOR}) find_package(BLAS) @@ -321,7 +326,7 @@ if (LLAMA_BLAS) if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - if ("${BLAS_INCLUDE_DIRS}" STREQUAL "") + if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple")) # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 find_package(PkgConfig REQUIRED) @@ -374,12 +379,15 @@ if (LLAMA_BLAS) add_compile_options(${BLAS_LINKER_FLAGS}) - add_compile_definitions(GGML_USE_OPENBLAS) + add_compile_definitions(GGML_USE_BLAS) if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) add_compile_definitions(GGML_BLAS_USE_MKL) endif() + set(GGML_HEADERS_BLAS ggml-blas.h) + set(GGML_SOURCES_BLAS ggml-blas.cpp) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) else() @@ -1258,6 +1266,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} ) diff --git a/Makefile b/Makefile index a4cab1bb208bc..744fe5739e95c 100644 --- a/Makefile +++ b/Makefile @@ -440,10 +440,11 @@ ifndef LLAMA_NO_ACCELERATE # Mac OS - include Accelerate framework. 
# `-framework Accelerate` works both with Apple Silicon and Mac Intel ifeq ($(UNAME_S),Darwin) - MK_CPPFLAGS += -DGGML_USE_ACCELERATE + MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_LDFLAGS += -framework Accelerate + OBJS += ggml-blas.o endif endif # LLAMA_NO_ACCELERATE @@ -454,21 +455,30 @@ ifndef LLAMA_NO_OPENMP endif # LLAMA_NO_OPENMP ifdef LLAMA_OPENBLAS - MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas) + OBJS += ggml-blas.o endif # LLAMA_OPENBLAS -ifndef LLAMA_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJS += sgemm.o -endif +ifdef LLAMA_OPENBLAS64 + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) + MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) + MK_LDFLAGS += $(shell pkg-config --libs openblas64) + OBJS += ggml-blas.o +endif # LLAMA_OPENBLAS64 ifdef LLAMA_BLIS - MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis + MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis MK_LDFLAGS += -lblis -L/usr/local/lib + OBJS += ggml-blas.o endif # LLAMA_BLIS +ifndef LLAMA_NO_LLAMAFILE + MK_CPPFLAGS += -DGGML_USE_LLAMAFILE + OBJS += sgemm.o +endif + ifdef LLAMA_RPC MK_CPPFLAGS += -DGGML_USE_RPC OBJS += ggml-rpc.o @@ -776,6 +786,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ +ggml-blas.o: ggml-blas.cpp ggml-blas.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + unicode.o: unicode.cpp unicode.h $(CXX) $(CXXFLAGS) -c $< -o $@ diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 61f5a5a0928a2..61dd1d71ab5e9 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.output_format = cmd_params_defaults.output_format; params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; for (int i = 1; i < argc; i++) { arg = argv[i]; diff --git a/ggml-alloc.c b/ggml-alloc.c index eb75962d49cc3..bd367c42df44e 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -339,6 +339,7 @@ struct hash_node { }; struct tensor_alloc { + int buffer_id; size_t offset; size_t size_max; // 0 = pre-allocated, unused, or view }; @@ -349,7 +350,6 @@ struct leaf_alloc { }; struct node_alloc { - int buffer_id; struct tensor_alloc dst; struct tensor_alloc src[GGML_MAX_SRC]; }; @@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs for (int i = 0; i < n_bufs; i++) { galloc->bufts[i] = bufts[i]; galloc->buffers[i] = NULL; - size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); - galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + + // check if the same buffer type is used multiple times and reuse the same allocator + for (int j = 0; j < i; j++) { + if (bufts[i] == bufts[j]) { + galloc->buf_tallocs[i] = galloc->buf_tallocs[j]; + break; + } + } + + if (galloc->buf_tallocs[i] == NULL) { + size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); + galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + } } galloc->n_buffers 
= n_bufs; @@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { for (int i = 0; i < galloc->n_buffers; i++) { if (galloc->buffers != NULL) { - ggml_backend_buffer_free(galloc->buffers[i]); + // skip if already freed + bool freed = false; + for (int j = 0; j < i; j++) { + if (galloc->buffers[j] == galloc->buffers[i]) { + freed = true; + break; + } + } + if (!freed) { + ggml_backend_buffer_free(galloc->buffers[i]); + } } if (galloc->buf_tallocs != NULL) { - ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + // skip if already freed + bool freed = false; + for (int j = 0; j < i; j++) { + if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { + freed = true; + break; + } + } + if (!freed) { + ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + } } } @@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor } } -static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { +static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { // graph outputs are never freed if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { AT_PRINTF("not freeing output %s\n", node->name); return; } - struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; - ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); size_t offset = hn->offset; + int buffer_id = hn->buffer_id; + struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; size_t size = ggml_backend_buft_get_alloc_size(buft, node); ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); hn->allocated = false; @@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { - ggml_gallocr_free_node(galloc, view_src, buffer_id); + ggml_gallocr_free_node(galloc, view_src); } } else if (p_hn->allocated) { - ggml_gallocr_free_node(galloc, parent, buffer_id); + ggml_gallocr_free_node(galloc, parent); } } AT_PRINTF("\n"); @@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); if (node->view_src || node->data) { + node_alloc->dst.buffer_id = -1; node_alloc->dst.offset = SIZE_MAX; node_alloc->dst.size_max = 0; } else { struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); - node_alloc->dst.offset = hn->offset; - node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); + node_alloc->dst.buffer_id = hn->buffer_id; + node_alloc->dst.offset = hn->offset; + node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (!src || src->view_src || src->data) { + node_alloc->src[j].buffer_id = -1; node_alloc->src[j].offset = SIZE_MAX; node_alloc->src[j].size_max = 0; } else { struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); + node_alloc->src[j].buffer_id = hn->buffer_id; node_alloc->src[j].offset = hn->offset; node_alloc->src[j].size_max = 
ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); } @@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); galloc->leaf_allocs[i].buffer_id = hn->buffer_id; if (leaf->view_src || leaf->data) { + galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; galloc->leaf_allocs[i].leaf.size_max = 0; } else { + galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id; galloc->leaf_allocs[i].leaf.offset = hn->offset; galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); } @@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c // reallocate buffers if needed for (int i = 0; i < galloc->n_buffers; i++) { + // if the buffer type is used multiple times, we reuse the same buffer + for (int j = 0; j < i; j++) { + if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { + galloc->buffers[i] = galloc->buffers[j]; + break; + } + } + size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); @@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c #ifndef NDEBUG fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif + ggml_backend_buffer_free(galloc->buffers[i]); galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); if (galloc->buffers[i] == NULL) { @@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); } -static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) { +static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { + int buffer_id = tensor_alloc->buffer_id; assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); if (tensor->view_src != NULL) { @@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * } } -static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) { - ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id]; +static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { + ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL; size_t node_size = (node->data || node->view_src) ? 
0 : ggml_backend_buft_get_alloc_size(buft, node); return talloc->size_max >= node_size; } @@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) { + if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) { #ifndef NDEBUG fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); #endif @@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph if (src == NULL) { continue; } - if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) { + if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) { #ifndef NDEBUG fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); #endif @@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i]; - ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf); + ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf); } // nodes for (int i = 0; i < graph->n_nodes; i++) { @@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) if (src == NULL) { continue; } - ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]); + ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]); } - ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst); + ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst); } return true; @@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { if (galloc->buffers[buffer_id] == NULL) { return 0; } + + for (int i = 0; i < buffer_id; i++) { + if (galloc->buffers[i] == galloc->buffers[buffer_id]) { + // this buffer is the same as a previous one due to the same buffer type being used multiple times + // only return the buffer size the first time it appears to avoid double counting + return 0; + } + } + return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index f121e1de420fa..36ca370867c9e 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -17,13 +17,15 @@ extern "C" { struct ggml_backend_buffer_type_i { const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); + // allocate a buffer of this type ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); - size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size - size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding - bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + // tensor alignment + size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); + // max buffer size that can be allocated + size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); + // data size needed to allocate the tensor, including 
padding + size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // check if tensor data is in host memory - // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft); }; @@ -92,27 +94,37 @@ extern "C" { void (*GGML_CALL synchronize)(ggml_backend_t backend); // compute graph with a plan (not used currently) + // create a new plan for a graph ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology + void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); + // compute the graph with the plan + enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph with a plan - enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - // check if the backend supports an operation + // check if the backend can compute an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // check if the backend can use tensors allocated in a buffer type + bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft); + // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer // these should be expensive operations with large batch sizes that may benefit from running on this backend // even if the weight has to be copied from the CPU temporarily bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); // (optional) event synchronization + // create a new event that can record events on this backend instance ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); void (*GGML_CALL event_free) (ggml_backend_event_t event); + // record an event on the backend instance that created it void (*GGML_CALL event_record) (ggml_backend_event_t event); + // wait for an event on on a different backend instance void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event); + // block until an event is recorded void (*GGML_CALL event_synchronize) (ggml_backend_event_t event); }; diff --git a/ggml-backend.c b/ggml-backend.c index 05737ed696954..2bec7bea38a85 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf return ggml_nbytes(tensor); } -bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return buft->iface.supports_backend(buft, backend); -} - bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { if (buft->iface.is_host) { return buft->iface.is_host(buft); @@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * return backend->iface.supports_op(backend, op); } +bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return backend->iface.supports_buft(backend, buft); +} + bool 
ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { if (backend->iface.offload_op != NULL) { return backend->iface.offload_op(backend, op); @@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_ GGML_UNUSED(buft); } -GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cpu(backend); - - GGML_UNUSED(buft); -} - GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; @@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, @@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, @@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(backend); +} + static struct ggml_backend_i cpu_backend_i = { /* .get_name = */ ggml_backend_cpu_name, /* .free = */ ggml_backend_cpu_free, @@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = { /* .synchronize = */ NULL, /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .supports_op = */ ggml_backend_cpu_supports_op, + /* .supports_buft = */ ggml_backend_cpu_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, @@ -1055,6 +1055,9 @@ struct ggml_backend_sched { int * node_backend_ids; // [graph_size] int * leaf_backend_ids; // [graph_size] + int * prev_node_backend_ids; // [graph_size] + int * prev_leaf_backend_ids; // [graph_size] + // copy of the graph with modified inputs struct ggml_cgraph * graph; @@ -1075,6 +1078,8 @@ struct ggml_backend_sched { ggml_backend_sched_eval_callback callback_eval; void * callback_eval_user_data; + bool debug; + // align context_buffer to GGML_MEM_ALIGN #ifdef _MSC_VER __declspec(align(GGML_MEM_ALIGN)) @@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen return -1; } -static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) { +static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) { ggml_backend_buffer_t buffer = tensor->buffer; if (buffer == NULL) { return -1; } - // find highest prio backend that supports the buffer type + // find highest prio backend that supports the buffer type 
and the op for (int i = 0; i < sched->n_backends; i++) { - if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { + if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) && + ggml_backend_supports_op(sched->backends[i], op)) { return i; } } - fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n", - __func__, ggml_backend_buffer_name(buffer), tensor->name); - GGML_ASSERT(false); +#ifndef NDEBUG + fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n", + __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name); +#endif return -1; } @@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st // TODO: use supports_op to check if the backend supports the op // assign pre-allocated nodes to their backend - int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor); + int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor); if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.dst"); return cur_backend_id; @@ -1139,7 +1146,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st // view_src if (tensor->view_src != NULL) { - cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); + cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor); if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.vsrc"); return cur_backend_id; @@ -1161,7 +1168,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st continue; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src); + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op if (src_backend_id == sched->n_backends - 1) { for (int b = 0; b < src_backend_id; b++) { @@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } } -//#define DEBUG_PASS1 -//#define DEBUG_PASS2 -//#define DEBUG_PASS3 -//#define DEBUG_PASS4 +static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) { + ggml_backend_buffer_t buf = t->view_src ? 
t->view_src->buffer : t->buffer; + ggml_backend_buffer_type_t buft = NULL; + + if (buf) { + // the tensor is already allocated + buft = buf->buft; + } else { + // see if the tensor already has a backend assigned, and use the buffer type of that backend + int tensor_backend_id = tensor_backend_id(t); + if (tensor_backend_id == -1 && t->view_src) { + tensor_backend_id = tensor_backend_id(t->view_src); + } + if (tensor_backend_id != -1) { + buft = sched->bufts[tensor_backend_id]; + } + } + + return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft); +} + +static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { + if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { + *node_backend_id = cur_backend_id; + SET_CAUSE(node, "2.sup"); + } +} // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { @@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } -#ifdef DEBUG_PASS1 - fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif // pass 2: expand current backend assignments // assign the same backend to adjacent nodes // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend) // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops - - - // pass 2.2 expand gpu down + // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known + // expand gpu down { int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { @@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } else { cur_backend_id = *node_backend_id; } - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.2"); + } else if (cur_backend_id != -1) { + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } - // pass 2.1 expand gpu up + // expand gpu up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } else { cur_backend_id = *node_backend_id; } - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.1"); + } else if (cur_backend_id != -1) { + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } - // pass 2.4 expand rest down + // expand rest down { int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { @@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.4"); + } else if (cur_backend_id != -1) { + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } - // pass 2.3 expand rest up + // expand rest up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int * node_backend_id = &tensor_backend_id(node); if 
(*node_backend_id != -1) { cur_backend_id = *node_backend_id; - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.3"); + } else if (cur_backend_id != -1) { + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } -#ifdef DEBUG_PASS2 - fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif + // pass 3: upgrade nodes to higher prio backends with compatible buffer types + // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there + // however, we also need to verify that the sources are in compatible buffer types + // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph + // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same + // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU) + // additionally, set remaining unassigned nodes to the backend with the most supported inputs + // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id == -1) { + // unassigned node: find the backend with the most supported inputs + int n_supported_best = -1; + for (int b = 0; b < sched->n_backends; b++) { + if (ggml_backend_supports_op(sched->backends[b], node)) { + int n_supported = 0; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) { + n_supported++; + } + } + if (n_supported > n_supported_best) { + n_supported_best = n_supported; + *node_backend_id = b; + SET_CAUSE(node, "3.best"); + } + } + } + } else { + // assigned node: upgrade to higher prio backend if possible + for (int b = 0; b < *node_backend_id; b++) { + if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) { + bool supported = true; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if (!ggml_backend_sched_buffer_supported(sched, src, b)) { + supported = false; + break; + } + } + if (supported) { + *node_backend_id = b; + SET_CAUSE(node, "3.upg"); + break; + } + } + } + } + } - // pass 3: assign backends to remaining src from dst and view_src + // pass 4: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * cur_backend_id = &tensor_backend_id(node); if (node->view_src != NULL && *cur_backend_id == -1) { *cur_backend_id = tensor_backend_id(node->view_src); - SET_CAUSE(node, "3.vsrc"); + SET_CAUSE(node, "4.vsrc"); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -1391,17 +1470,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (src->view_src != NULL) { // views are always on the same backend as the source *src_backend_id = tensor_backend_id(src->view_src); - SET_CAUSE(src, "3.vsrc"); + SET_CAUSE(src, 
"4.vsrc"); } else { *src_backend_id = *cur_backend_id; - SET_CAUSE(src, "3.cur"); + SET_CAUSE(src, "4.cur"); } } } } -#ifdef DEBUG_PASS3 - fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif // pass 4: split graph, find tensors that need to be copied { @@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } // check if the split has too many inputs + // FIXME: count the number of inputs instead of only checking when full if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); int src_backend_id = sched->tensor_backend_id[id]; - if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) { + bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); + if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) { //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; break; @@ -1486,7 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const int src_backend_id = tensor_backend_id(src); assert(src_backend_id != -1); // all inputs should be assigned by now - if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { + if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { size_t id = hash_id(src); if (sched->tensor_copies[id][src_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; @@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (src_backend_id != node_backend_id) { + bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); + if (src_backend_id != cur_backend_id && !supported) { // create a copy of the input in the split's backend const size_t id = hash_id(src); if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { @@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split->i_end = graph->n_nodes; sched->n_splits = i_split + 1; } -#ifdef DEBUG_PASS4 - fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif + + if (sched->debug) { + ggml_backend_sched_print_assignments(sched, graph); + } + + // swap node_backend_ids and leaf_backend_ids and prevs + { + int * tmp = sched->node_backend_ids; + sched->node_backend_ids = sched->prev_node_backend_ids; + sched->prev_node_backend_ids = tmp; + + tmp = sched->leaf_backend_ids; + sched->leaf_backend_ids = sched->prev_leaf_backend_ids; + sched->prev_leaf_backend_ids = tmp; + } // create copies of the graph for each split // TODO: avoid this copy @@ -1613,8 +1704,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { + bool backend_ids_changed = false; + for (int i = 0; i < sched->graph->n_nodes; i++) { + if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) { + backend_ids_changed = true; + break; + } + } + if (!backend_ids_changed) { + for (int i = 0; i < sched->graph->n_leafs; i++) { + if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) { + backend_ids_changed = true; + break; + } + } + } + // allocate graph - if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + if (backend_ids_changed || 
!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG @@ -1727,6 +1834,8 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); + sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; + // initialize hash table sched->hash_set = ggml_hash_set_new(graph_size); sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0])); @@ -1735,6 +1844,8 @@ ggml_backend_sched_t ggml_backend_sched_new( const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); + sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); + sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); sched->n_backends = n_backends; @@ -1747,7 +1858,7 @@ ggml_backend_sched_t ggml_backend_sched_new( for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); - GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b])); + GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b])); if (sched->n_copies > 1) { for (int c = 0; c < sched->n_copies; c++) { sched->events[b][c] = ggml_backend_event_new(backends[b]); @@ -1779,6 +1890,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { free(sched->tensor_copies); free(sched->node_backend_ids); free(sched->leaf_backend_ids); + free(sched->prev_node_backend_ids); + free(sched->prev_leaf_backend_ids); free(sched); } @@ -1875,6 +1988,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); tensor_backend_id(node) = backend_index; + SET_CAUSE(node, "usr"); } ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { diff --git a/ggml-backend.h b/ggml-backend.h index c582b06850ed1..47fd814751795 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -23,7 +23,6 @@ extern "C" { GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer @@ -74,6 +73,7 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); + GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends @@ -90,7 +90,7 @@ extern 
"C" { GGML_API void ggml_backend_event_free (ggml_backend_event_t event); GGML_API void ggml_backend_event_record (ggml_backend_event_t event); GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); - GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event + GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // // CPU backend @@ -119,7 +119,7 @@ extern "C" { GGML_API size_t ggml_backend_reg_get_count(void); GGML_API size_t ggml_backend_reg_find_by_name(const char * name); - GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params] + GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional) GGML_API const char * ggml_backend_reg_get_name(size_t i); GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i); diff --git a/ggml-blas.cpp b/ggml-blas.cpp new file mode 100644 index 0000000000000..d709a357bbf29 --- /dev/null +++ b/ggml-blas.cpp @@ -0,0 +1,363 @@ +#include "ggml-blas.h" +#include "ggml-backend-impl.h" + +#include +#include + +#if defined(GGML_USE_ACCELERATE) +# include +#elif defined(GGML_BLAS_USE_MKL) +# include +#else +# include +# ifdef BLIS_ENABLE_CBLAS +# include +# endif +#endif + +struct ggml_backend_blas_context { + int n_threads = GGML_DEFAULT_N_THREADS; + std::unique_ptr work_data; + size_t work_size = 0; +#ifndef GGML_USE_OPENMP + std::vector> tasks; +#endif +}; + +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; + } + + return false; +} + +static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const enum ggml_type type = src0->type; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + const int64_t ne_plane = ne01*ne00; + const size_t desired_wsize = type == GGML_TYPE_F32 ? 
0 : ne03*ne02*ne_plane*sizeof(float); + + if (ctx->work_size < desired_wsize) { + ctx->work_data.reset(new char[desired_wsize]); + ctx->work_size = desired_wsize; + } + void * wdata = ctx->work_data.get(); + + // convert src0 to float + if (type != GGML_TYPE_F32) { + ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); + ggml_to_float_t const to_float = type_traits.to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for num_threads(n_threads) + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } +#else + for (int i = 1; i < n_threads; i++) { + const int64_t start = i*ne01/n_threads; + const int64_t end = (i + 1)*ne01/n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01/n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + } +#endif + } + } + +#ifndef GGML_USE_OPENMP + // wait for all tasks to finish + for (auto & task : ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); +#endif + } + +#if defined(OPENBLAS_VERSION) + openblas_set_num_threads(ctx->n_threads); +#endif + +#if defined(BLIS_ENABLE_CBLAS) + bli_thread_set_num_threads(ctx->n_threads); +#endif + + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + } + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne1, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } +} + +static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne03 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // 
c: (m,n) + // + // However, if ggml_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. + + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + CBLAS_TRANSPOSE transposeA; + int lda; + + if (!ggml_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + GGML_UNUSED(ctx); +} + +// backend interface + +GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { + return "BLAS"; + + GGML_UNUSED(backend); +} + +GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + delete ctx; + delete backend; +} + +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(backend); +} + +GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_backend_blas_mul_mat(ctx, node); + break; + + case GGML_OP_OUT_PROD: + ggml_backend_blas_out_prod(ctx, node); + break; + + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + + default: + fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + GGML_ASSERT(false); + } + } + + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) || + (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(backend); +} + +static struct ggml_backend_i blas_backend_i = { + /* .get_name = */ ggml_backend_blas_name, + /* .free = */ ggml_backend_blas_free, + /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_blas_graph_compute, + /* .supports_op = */ ggml_backend_blas_supports_op, + /* .supports_buft = */ ggml_backend_blas_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* 
.event_synchronize = */ NULL, +}; + +static ggml_guid_t ggml_backend_blas_guid(void) { + static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + return &guid; +} + +ggml_backend_t ggml_backend_blas_init(void) { + ggml_backend_blas_context * ctx = new ggml_backend_blas_context; + + ggml_backend_t backend = new ggml_backend { + /* .guid = */ ggml_backend_blas_guid(), + /* .interface = */ blas_backend_i, + /* .context = */ ctx, + }; + +#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) + if (openblas_get_parallel() != OPENBLAS_OPENMP) { + fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + } +#endif + +#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) + fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); +#endif + + return backend; +} + +GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); +} + +void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { + GGML_ASSERT(ggml_backend_is_blas(backend_blas)); + + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; + ctx->n_threads = n_threads; +} diff --git a/ggml-blas.h b/ggml-blas.h new file mode 100644 index 0000000000000..f2e37de06f609 --- /dev/null +++ b/ggml-blas.h @@ -0,0 +1,23 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); + +GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); + +// number of threads used for conversion to float +// for openblas and blis, this will also set the number of threads used for blas operations +GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c6bc3f64c90de..64d3b6747fc41 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -543,6 +543,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu return ctx->name.c_str(); } +static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cuda_buffer_type_name; +} + GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; @@ -585,24 +589,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen GGML_UNUSED(buft); } -GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_cuda(backend)) { - return false; - } - - ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - return buft_ctx->device == cuda_ctx->device; -} - static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = 
*/ ggml_backend_cuda_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -863,6 +855,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back GGML_UNUSED(buft); } +static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name; +} + GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor @@ -906,12 +902,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cuda(backend); - - GGML_UNUSED(buft); -} - GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -924,7 +914,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; @@ -1024,7 +1013,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -2879,6 +2867,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (ggml_backend_buft_is_cuda_split(buft)) { + return true; + } + + if (ggml_backend_buft_is_cuda(buft)) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + return buft_ctx->device == cuda_ctx->device; + } + + return false; +} + GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { const int min_batch_size = 32; @@ -2951,9 +2953,11 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .synchronize = */ ggml_backend_cuda_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, + /* .supports_buft = */ ggml_backend_cuda_supports_buft, /* .offload_op = */ ggml_backend_cuda_offload_op, /* .event_new = */ ggml_backend_cuda_event_new, /* .event_free = */ 
ggml_backend_cuda_event_free, diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 18c6f4a104f36..ed5f2e3494ba4 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1902,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_ return ctx->max_alloc; } -static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - GGML_UNUSED(buft); - return ggml_backend_is_kompute(backend); -} - static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = { /* .get_name = */ ggml_backend_kompute_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -1973,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc return ggml_vk_supports_op(op); } +static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(backend); + return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name; +} + static struct ggml_backend_i kompute_backend_i = { /* .get_name = */ ggml_backend_kompute_name, /* .free = */ ggml_backend_kompute_free, @@ -1983,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = { /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_kompute_graph_compute, /* .supports_op = */ ggml_backend_kompute_supports_op, + /* .supports_buft = */ ggml_backend_kompute_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-metal.m b/ggml-metal.m index b5c287347e7c9..ec9e95302096c 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -3044,12 +3044,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend UNUSED(buft); } -GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); - - UNUSED(buft); -} - GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; @@ -3064,7 +3058,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend, /* .is_host = */ ggml_backend_metal_buffer_type_is_host, }, /* .context = */ NULL, @@ -3179,6 +3172,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con return ggml_metal_supports_op(metal_ctx, op); } +GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name; + + UNUSED(backend); +} + static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, @@ -3189,9 +3188,11 @@ GGML_CALL static bool 
ggml_backend_metal_supports_op(ggml_backend_t backend, con /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, + /* .supports_buft = */ ggml_backend_metal_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index 679ce4f280c5f..9b95193d3229d 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend return ggml_nbytes(tensor); } -GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_rpc(backend)) { - return false; - } - ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; - ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; - return buft_ctx->endpoint == rpc_ctx->endpoint; -} - static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = { /* .get_name = */ ggml_backend_rpc_buffer_type_name, /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_rpc_get_max_size, /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -638,6 +628,15 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const return false; } +GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name == ggml_backend_rpc_buffer_type_name) { + return false; + } + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; + return buft_ctx->endpoint == rpc_ctx->endpoint; +} + static ggml_backend_i ggml_backend_rpc_interface = { /* .get_name = */ ggml_backend_rpc_name, /* .free = */ ggml_backend_rpc_free, @@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .synchronize = */ ggml_backend_rpc_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_rpc_graph_compute, /* .supports_op = */ ggml_backend_rpc_supports_op, + /* .supports_buft = */ ggml_backend_rpc_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index e7d260bd4ebe3..6f41ed2723794 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -16575,22 +16575,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen UNUSED(buft); } -GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_sycl(backend)) { - return false; - } - ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; - ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - return buft_ctx->device == sycl_ctx->device; -} - static ggml_backend_buffer_type_i 
ggml_backend_sycl_buffer_type_interface = { /* .get_name = */ ggml_backend_sycl_buffer_type_name, /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend, /* .is_host = */ nullptr, }; @@ -16942,12 +16932,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_sycl(backend); - - UNUSED(buft); -} - GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -16960,7 +16944,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend, /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host, }; @@ -17046,7 +17029,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -17311,6 +17293,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) { + return false; + } + ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; + return buft_ctx->device == sycl_ctx->device; +} static ggml_backend_i ggml_backend_sycl_interface = { /* .get_name = */ ggml_backend_sycl_name, @@ -17322,9 +17312,11 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .synchronize = */ ggml_backend_sycl_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, + /* .supports_buft = */ ggml_backend_sycl_supports_buft, /* .offload_op = */ ggml_backend_sycl_offload_op, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5b92804915658..e2d17a3523a48 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -6142,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_ UNUSED(buft); } -GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_vk(backend)) { - return false; - } - - ggml_backend_vk_buffer_type_context * buft_ctx = 
(ggml_backend_vk_buffer_type_context *)buft->context; - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - return buft_ctx->ctx->idx == ctx->idx; -} - static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .get_name = */ ggml_backend_vk_buffer_type_name, /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -6235,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -6551,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g UNUSED(backend); } +GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { + return false; + } + + ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + return buft_ctx->ctx->idx == ctx->idx; +} + // TODO: enable async and synchronize static ggml_backend_i ggml_backend_vk_interface = { /* .get_name = */ ggml_backend_vk_name, @@ -6562,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .synchronize = */ NULL, // ggml_backend_vk_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_vk_graph_compute, /* .supports_op = */ ggml_backend_vk_supports_op, + /* .supports_buft = */ ggml_backend_vk_supports_buft, /* .offload_op = */ ggml_backend_vk_offload_op, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml.c b/ggml.c index 2ea1d76773893..d5d33c2ba1029 100644 --- a/ggml.c +++ b/ggml.c @@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #if defined(GGML_USE_ACCELERATE) #include -#elif defined(GGML_USE_OPENBLAS) -#if defined(GGML_BLAS_USE_MKL) -#include -#else -#include -#endif #endif // floating point type used to accumulate sums @@ -12179,39 +12173,6 @@ static void ggml_compute_forward_group_norm( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float) - // all the experts for each batch element and the processing would 
become incredibly slow - // TODO: find the optimal values for these - if (dst->op != GGML_OP_MUL_MAT_ID && - ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - //src0->type == GGML_TYPE_F32 && - src1->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} -#endif - static void ggml_compute_forward_mul_mat_one_chunk( const struct ggml_compute_params * params, struct ggml_tensor * dst, @@ -12349,73 +12310,6 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(dst)) { - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float); - UNUSED(desired_wsize); - - if (params->type == GGML_TASK_TYPE_INIT) { - if (type != GGML_TYPE_F32) { - assert(params->wsize >= desired_wsize); - // parallelize by src0 rows - for (int64_t i13 = 0; i13 < ne13; i13++) { - for (int64_t i12 = 0; i12 < ne12; i12++) { - // broadcast src0 into src1 across 2nd,3rd dimension - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; - - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; - ggml_to_float_t const to_float = type_traits[type].to_float; - - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00); - } - } - } - } - return; - } - - if (params->type == GGML_TASK_TYPE_FINALIZE) { - return; - } - - // perform sgemm, parallelization controlled by blas lib - if (ith != 0) { - return; - } - - //const int64_t tgemm0 = ggml_perf_time_us(); - for (int64_t i13 = 0; i13 < ne13; i13++) { - for (int64_t i12 = 0; i12 < ne12; i12++) { - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; - - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - if (type != GGML_TYPE_F32) { - x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; - } - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne1, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2); - - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif - #if GGML_USE_LLAMAFILE const bool src1_cont = ggml_is_contiguous(src1); @@ -12796,19 +12690,7 @@ static void ggml_compute_forward_out_prod_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - bool use_blas = ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1)); -#endif - if (params->type == GGML_TASK_TYPE_INIT) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst - if (use_blas) { - return; - } -#endif if (ith != 0) { return; } @@ -12820,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32( return; } -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (use_blas) { - if (params->ith != 0) { // All threads other than the first do no work. 
- return; - } - // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) - // src0: (k,n) - // src1: (k,m) - // dst: (m,n) - // - // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) - // Also expressed as (major,minor) - // a: (m,k): so src1 transposed - // b: (k,n): so src0 - // c: (m,n) - // - // However, if ggml_is_transposed(src1) is true, then - // src1->data already contains a transposed version, so sgemm mustn't - // transpose it further. - - int n = src0->ne[0]; - int k = src0->ne[1]; - int m = src1->ne[0]; - - int transposeA, lda; - - if (!ggml_is_transposed(src1)) { - transposeA = CblasTrans; - lda = m; - } else { - transposeA = CblasNoTrans; - lda = k; - } - - float * a = (float *) ((char *) src1->data); - float * b = (float *) ((char *) src0->data); - float * c = (float *) ((char *) dst->data); - - cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); - - return; - } -#endif - // dst[:,:,:,:] = 0 // for i2,i3: // for i1: @@ -12993,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; @@ -13391,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + dequantize_row_q( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); @@ -13434,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + ggml_fp16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); @@ -13477,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - ggml_bf16_to_fp32_row( + assert(i01 >= 0 && i01 < ne01); + + ggml_bf16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } @@ -13520,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + ggml_vec_cpy_f32(nc, (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); @@ -18893,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: + case GGML_OP_CONT: case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_ACC: @@ -18977,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ } break; case GGML_OP_SCALE: case GGML_OP_SET: - case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: @@ -19137,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct 
ggml_comput sched_yield(); } - * node_n = atomic_load(&state->shared->node_n); - if (* node_n != last_node_n) break; + *node_n = atomic_load(&state->shared->node_n); + if (*node_n != last_node_n) { + break; + } + #if defined(__SSE3__) // Tell the processor we're spinning. It's a processor hint for spinlocks. _mm_pause(); @@ -19148,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) { // wait for other threads to finish - const int last_task_phase = * task_phase; + const int last_task_phase = *task_phase; while (true) { if (do_yield) { sched_yield(); } - * task_phase = atomic_load(&state->shared->node_task); - if (* task_phase != last_task_phase) break; + *task_phase = atomic_load(&state->shared->node_task); + if (*task_phase != last_task_phase) { + break; + } + #if defined(__SSE3__) // Tell the processor we're spinning. It's a processor hint for spinlocks. _mm_pause(); @@ -19356,17 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa { const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node)) { - if (node->src[0]->type != GGML_TYPE_F32) { - // here we need memory for fully dequantized matrix from src0 - // take into account that src0 can be broadcasted into src1[2,3] - cur = ggml_type_size(GGML_TYPE_F32) - * node->src[0]->ne[0]*node->src[0]->ne[1] - * node->src[1]->ne[2]*node->src[1]->ne[3]; - } - } else -#endif if (node->src[1]->type != vec_dot_type) { cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); } @@ -22664,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) return 1; #else return 0; diff --git a/llama.cpp b/llama.cpp index 8b675ea993a38..225ea977f4612 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,6 +21,10 @@ # include "ggml-kompute.h" #endif +#ifdef GGML_USE_BLAS +# include "ggml-blas.h" +#endif + #ifdef GGML_USE_METAL # include "ggml-metal.h" #endif @@ -2299,9 +2303,13 @@ struct llama_context { std::vector backends; #ifdef GGML_USE_METAL ggml_backend_t backend_metal = nullptr; +#endif +#ifdef GGML_USE_BLAS + ggml_backend_t backend_blas = nullptr; #endif ggml_backend_t backend_cpu = nullptr; + const llama_model & model; // key + value cache for the self attention @@ -11529,7 +11537,8 @@ static struct ggml_cgraph * llama_build_graph( if (batch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { for (auto * backend : lctx.backends) { - if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { + if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) && + (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) { ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); break; } @@ -12026,6 +12035,11 @@ static void llama_graph_compute( ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } +#ifdef GGML_USE_BLAS + if (lctx.backend_blas != nullptr) { 
+ ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads); + } +#endif ggml_backend_sched_graph_compute_async(lctx.sched, gf); @@ -12248,17 +12262,6 @@ static int llama_decode_internal( } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well - // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering - // with the BLAS calls. need a better solution - // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is - // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. - if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { - n_threads = std::min(4, n_threads); - } - ggml_backend_sched_alloc_graph(lctx.sched, gf); llama_set_inputs(lctx, u_batch); @@ -16251,6 +16254,16 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(backend); } #endif + +#ifdef GGML_USE_BLAS + ctx->backend_blas = ggml_backend_blas_init(); + if (ctx->backend_blas == nullptr) { + LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__); + } else { + ctx->backends.push_back(ctx->backend_blas); + } +#endif + #if defined(GGML_USE_RPC) if (model->n_gpu_layers > 0) { for (const auto & endpoint : model->rpc_servers) { From a55eb1bf0fa2fd84147bdfd384391e029d988253 Mon Sep 17 00:00:00 2001 From: Galunid Date: Thu, 13 Jun 2024 09:42:41 +0200 Subject: [PATCH 3/4] readme : Remove outdated instructions from README.md (#7914) [no ci] --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index d1c6190ddfd7b..6c24135d61934 100644 --- a/README.md +++ b/README.md @@ -622,9 +622,6 @@ python3 -m pip install -r requirements.txt # convert the model to ggml FP16 format python3 convert-hf-to-gguf.py models/mymodel/ -# [Optional] for models using BPE tokenizers -python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe - # quantize the model to 4-bits (using Q4_K_M method) ./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M From 172c8256840ffd882ab9992ecedbb587d9b21f15 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Thu, 13 Jun 2024 15:18:44 +0300 Subject: [PATCH 4/4] rpc : fix ggml_backend_rpc_supports_buft() (#7918) --- ggml-rpc.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index 9b95193d3229d..22d9524b8d764 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -624,12 +624,12 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) { UNUSED(backend); UNUSED(op); - GGML_ASSERT(false && "not implemented"); - return false; + //TODO: call the remote backend and cache the results + return true; } GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - if (buft->iface.get_name == ggml_backend_rpc_buffer_type_name) { + if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) { return false; } ggml_backend_rpc_buffer_type_context * buft_ctx = 
(ggml_backend_rpc_buffer_type_context *)buft->context;
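

Editor's note: the GGML_OP_OUT_PROD path added in ggml-blas.cpp (and the equivalent code removed from ggml.c) maps ggml's out-product onto a single cblas_sgemm call. A minimal, standalone sketch of that mapping follows, with arbitrarily chosen shapes; it assumes an OpenBLAS-style cblas.h and is an illustration only, not part of the patch.

    // Illustration of the out_prod -> sgemm mapping, using the comment's notation:
    // in (major,minor) terms src0 is (k,n), src1 is (k,m) and dst is (m,n),
    // i.e. src0->ne = {n, k}, src1->ne = {m, k}, dst->ne = {n, m}.
    #include <cblas.h>
    #include <cstdio>

    int main() {
        const int n = 4, k = 3, m = 2;

        float src1[k * m];  // contiguous (non-transposed) src1: k rows of m floats
        float src0[k * n];  // src0: k rows of n floats
        float dst [m * n];  // dst: m rows of n floats

        for (int i = 0; i < k * m; i++) src1[i] = 1.0f;
        for (int i = 0; i < k * n; i++) src0[i] = 2.0f;

        // Same call shape as ggml_backend_blas_out_prod for the
        // !ggml_is_transposed(src1) case:
        // A = src1 with CblasTrans (lda = m), B = src0 (ldb = n), C = dst (ldc = n).
        cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, m, n, k,
                    1.0f, src1, m, src0, n, 0.0f, dst, n);

        printf("dst[0] = %.1f (expected 1*2 summed over k=3, i.e. 6.0)\n", dst[0]);
        return 0;
    }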
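Editor's note: the public API declared in the new ggml-blas.h is small. The hedged sketch below shows how a host application might create and configure the backend; the teardown call ggml_backend_free() is the usual ggml-backend API from ggml-backend.h, and the NULL check mirrors the one added to llama_new_context_with_model(). The helper name is hypothetical and this is illustrative only, not part of the patch.

    #include "ggml-blas.h"

    // Create the BLAS backend and set its thread count (hypothetical helper).
    static ggml_backend_t example_init_blas(int n_threads) {
        ggml_backend_t backend = ggml_backend_blas_init();
        if (backend == NULL) {
            return NULL;
        }
        // n_threads controls the dequantization-to-float work; for OpenBLAS and
        // BLIS it also sets the thread count used by the BLAS kernels themselves.
        if (ggml_backend_is_blas(backend)) {
            ggml_backend_blas_set_n_threads(backend, n_threads);
        }
        return backend;
    }

    // The backend would then be registered alongside the CPU backend (as the
    // llama.cpp hunk above does via lctx.backends) and released with
    // ggml_backend_free(backend) when no longer needed.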