diff --git a/.github/workflows/test_cli_llama_cpp.yaml b/.github/workflows/test_cli_cpu_llama_cpp.yaml
similarity index 88%
rename from .github/workflows/test_cli_llama_cpp.yaml
rename to .github/workflows/test_cli_cpu_llama_cpp.yaml
index 8e3e583d..4a7272ad 100644
--- a/.github/workflows/test_cli_llama_cpp.yaml
+++ b/.github/workflows/test_cli_cpu_llama_cpp.yaml
@@ -1,4 +1,4 @@
-name: CLI Llama.cpp Tests
+name: CLI CPU Llama.Cpp Tests
 
 on:
   workflow_dispatch:
@@ -26,15 +26,15 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
 
 jobs:
-  run_cli_llama_cpp_tests:
+  run_cli_cpu_llama_cpp_tests:
     runs-on: ubuntu-latest
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Set up Python 3.10
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
         with:
           python-version: "3.10"
 
diff --git a/README.md b/README.md
index e6be23a2..e2f824d3 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices
 
 *News* 📰
 
+- LlamaCpp backend for benchmarking [`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python) bindings with all its supported devices 🚀
 - 🥳 PyPI package is now available for installation: `pip install optimum-benchmark` 🎉 [check it out](https://pypi.org/project/optimum-benchmark/) !
 - Model loading latency/memory/energy tracking for all backends in the inference scenario 🚀
 - numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs.
@@ -47,6 +48,7 @@ Optimum-Benchmark is continuously and intensively tested on a variety of devices
 
 ### CLI 📈
 
+[![CLI_CPU_LLAMA_CPP](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml)
 [![CLI_CPU_NEURAL_COMPRESSOR](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml)
 [![CLI_CPU_ONNXRUNTIME](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml)
 [![CLI_CPU_OPENVINO](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml)
diff --git a/optimum_benchmark/backends/llama_cpp/backend.py b/optimum_benchmark/backends/llama_cpp/backend.py
index 60be9066..2955b423 100644
--- a/optimum_benchmark/backends/llama_cpp/backend.py
+++ b/optimum_benchmark/backends/llama_cpp/backend.py
@@ -1,5 +1,6 @@
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, Tuple
+import subprocess
 
 from llama_cpp import Llama
 
@@ -28,20 +29,25 @@ def load_model_from_pretrained(self) -> None:
         """
         Load the pretrained model from the given model name (normally GGUF, GGML)
         """
-        embedding = True if self.config.task == "feature-extraction" else False
 
         self.pretrained_model = Llama.from_pretrained(
-            repo_id=self.config.model,  # type: ignore
+            repo_id=self.config.model,
             filename=self.config.filename,
-            verbose=False,
-            echo=False,
-            embedding=embedding,
-        )  # type: ignore
+            **self.llama_cpp_kwargs,
+        )
 
     def validate_task(self) -> None:
         if self.config.task not in ["text-generation"]:
             raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")
 
+    @property
+    def llama_cpp_kwargs(self) -> Dict[str, Any]:
+        return {
+            "embedding": self.config.task == "feature-extraction",
+            "verbose": False,
+            "echo": False,
+        }
+
     def prepare_inputs(self, inputs: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         if self.config.task == "text-generation":
             if inputs["input_ids"].shape[0] != 1:
diff --git a/optimum_benchmark/backends/llama_cpp/config.py b/optimum_benchmark/backends/llama_cpp/config.py
index 794ea9d3..5b98da27 100644
--- a/optimum_benchmark/backends/llama_cpp/config.py
+++ b/optimum_benchmark/backends/llama_cpp/config.py
@@ -5,12 +5,6 @@
 from ...import_utils import llama_cpp_version
 from ..config import BackendConfig
 
-LOGGER = getLogger("backend")
-
-
-def llama_cpp_model_kwargs():
-    return {"verbose": True}
-
 
 @dataclass
 class LlamaCppConfig(BackendConfig):
@@ -30,5 +24,3 @@ def __post_init__(self):
 
         if self.device not in ["cuda", "mps", "cpu"]:
             raise ValueError(f"Llama.cpp Backend only supports 'cpu', 'mps' and 'cuda' devices, got {self.device}")
-
-        LOGGER.warning("Llama.cpp automatically selects the device, ignoring the device parameter in the config.")