Commit 1b7ddb1: update
IlyasMoutawwakil committed Jul 30, 2024
1 parent 1aa04f0 commit 1b7ddb1
Showing 4 changed files with 18 additions and 18 deletions.
@@ -1,4 +1,4 @@
-name: CLI Llama.cpp Tests
+name: CLI CPU Llama.Cpp Tests
 
 on:
   workflow_dispatch:
@@ -26,15 +26,15 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
 
 jobs:
-  run_cli_llama_cpp_tests:
+  run_cli_cpu_llama_cpp_tests:
     runs-on: ubuntu-latest
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Set up Python 3.10
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
         with:
           python-version: "3.10"

2 changes: 2 additions & 0 deletions README.md
@@ -13,6 +13,7 @@ Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices
 
 *News* 📰
 
+- LlamaCpp backend for benchmarking [`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python) bindings with all its supported devices 🚀
 - 🥳 PyPI package is now available for installation: `pip install optimum-benchmark` 🎉 [check it out](https://pypi.org/project/optimum-benchmark/) !
 - Model loading latency/memory/energy tracking for all backends in the inference scenario 🚀
 - numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs.
@@ -47,6 +48,7 @@ Optimum-Benchmark is continuously and intensively tested on a variety of devices
 
 ### CLI 📈
 
+[![CLI_CPU_LLAMA_CPP](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml)
 [![CLI_CPU_NEURAL_COMPRESSOR](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml)
 [![CLI_CPU_ONNXRUNTIME](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml)
 [![CLI_CPU_OPENVINO](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml)
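For context on what the new badge and news entry refer to, a rough sketch of benchmarking the new LlamaCpp backend through the Python API is shown below. It assumes the `Benchmark`/`BenchmarkConfig` pattern documented for other backends carries over unchanged; the GGUF repository, filename, and field values are illustrative assumptions, not taken from this commit.

```python
# Hedged sketch: assumes optimum-benchmark's documented Python API
# (Benchmark, BenchmarkConfig, ProcessConfig, InferenceConfig) works the same
# way for the llama_cpp backend; repo/filename below are hypothetical.
from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig
from optimum_benchmark.backends.llama_cpp.config import LlamaCppConfig

if __name__ == "__main__":
    backend_config = LlamaCppConfig(
        model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",  # hypothetical GGUF repo
        filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # hypothetical filename
        task="text-generation",
        device="cpu",
    )
    benchmark_config = BenchmarkConfig(
        name="cli_cpu_llama_cpp",
        launcher=ProcessConfig(),
        scenario=InferenceConfig(latency=True, memory=True),
        backend=backend_config,
    )
    benchmark_report = Benchmark.launch(benchmark_config)
```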
18 changes: 12 additions & 6 deletions optimum_benchmark/backends/llama_cpp/backend.py
@@ -1,5 +1,6 @@
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, Tuple
+import subprocess
 
 from llama_cpp import Llama

@@ -28,20 +29,25 @@ def load_model_from_pretrained(self) -> None:
         """
         Load the pretrained model from the given model name (normally GGUF, GGML)
         """
-        embedding = True if self.config.task == "feature-extraction" else False
 
         self.pretrained_model = Llama.from_pretrained(
-            repo_id=self.config.model,  # type: ignore
+            repo_id=self.config.model,
             filename=self.config.filename,
-            verbose=False,
-            echo=False,
-            embedding=embedding,
-        )  # type: ignore
+            **self.llama_cpp_kwargs,
+        )
 
     def validate_task(self) -> None:
         if self.config.task not in ["text-generation"]:
             raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")
 
+    @property
+    def llama_cpp_kwargs(self) -> Dict[str, Any]:
+        return {
+            "embedding": self.config.task == "feature-extraction",
+            "verbose": False,
+            "echo": False,
+        }
+
     def prepare_inputs(self, inputs: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         if self.config.task == "text-generation":
             if inputs["input_ids"].shape[0] != 1:
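The refactor above moves the task-dependent arguments into a `llama_cpp_kwargs` property instead of computing them inline, so the GGUF load path and any future task-specific options live in one place. At load time the call is equivalent to the following standalone sketch against `llama-cpp-python`; the repository and filename are hypothetical placeholders, not values from this commit.

```python
from typing import Any, Dict

from llama_cpp import Llama


def llama_cpp_kwargs(task: str) -> Dict[str, Any]:
    # Mirrors the new property: embeddings only for feature-extraction,
    # and quiet output while benchmarking.
    return {
        "embedding": task == "feature-extraction",
        "verbose": False,
        "echo": False,
    }


# Hypothetical GGUF repo/filename; Llama.from_pretrained downloads via huggingface_hub.
model = Llama.from_pretrained(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    **llama_cpp_kwargs("text-generation"),
)
print(model("What is benchmarking?", max_tokens=16)["choices"][0]["text"])
```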
8 changes: 0 additions & 8 deletions optimum_benchmark/backends/llama_cpp/config.py
@@ -5,12 +5,6 @@
 from ...import_utils import llama_cpp_version
 from ..config import BackendConfig
 
-LOGGER = getLogger("backend")
-
-
-def llama_cpp_model_kwargs():
-    return {"verbose": True}
-
 
 @dataclass
 class LlamaCppConfig(BackendConfig):
@@ -30,5 +24,3 @@ def __post_init__(self):
 
         if self.device not in ["cuda", "mps", "cpu"]:
             raise ValueError(f"Llama.cpp Backend only supports 'cpu', 'mps' and 'cuda' devices, got {self.device}")
-
-        LOGGER.warning("Llama.cpp automatically selects the device, ignoring the device parameter in the config.")
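With the module-level logger and the unused `llama_cpp_model_kwargs()` helper removed, the config keeps only the device check in `__post_init__`. A minimal standalone illustration of that pattern follows; the simplified fields are assumptions, not the real `LlamaCppConfig` definition.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ExampleLlamaCppConfig:
    # Simplified stand-in for LlamaCppConfig; real field names and defaults may differ.
    model: str = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"  # hypothetical GGUF repo
    filename: Optional[str] = None
    device: str = "cpu"

    def __post_init__(self):
        # Same validation as the kept __post_init__ branch shown above.
        if self.device not in ["cuda", "mps", "cpu"]:
            raise ValueError(
                f"Llama.cpp Backend only supports 'cpu', 'mps' and 'cuda' devices, got {self.device}"
            )


config = ExampleLlamaCppConfig(device="cpu")  # passes validation
# ExampleLlamaCppConfig(device="xpu")         # would raise ValueError
```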
