[Backend] Add Llamacpp backend #2975

Open · wants to merge 63 commits into base: main

Changes shown from 51 of the 63 commits.

Commits
95e221e  Add llamacpp backend (angt, Jan 24, 2025)
bd0cc99  Get rid of llama_batch_get_one() (angt, Jan 30, 2025)
3eb4823  Use max_batch_total_tokens (angt, Jan 30, 2025)
e7facf6  Handle max_batch_size (angt, Jan 30, 2025)
a7b4b04  Add some input validation checks (angt, Jan 30, 2025)
8d2dfdf  Handle ctx args & fix sampling (angt, Jan 30, 2025)
f388747  Add GPU args (angt, Jan 31, 2025)
e07835c  Add --defrag-threshold (angt, Jan 31, 2025)
d6ded89  Add a stupid batch mechanism (angt, Jan 31, 2025)
390f0ec  Cleanup (angt, Jan 31, 2025)
7a3ed41  Add --numa (angt, Jan 31, 2025)
3f19913  Fix args (angt, Jan 31, 2025)
ae5bb78  Enable flash attention by default (angt, Jan 31, 2025)
e88a527  Add --offload-kqv (angt, Jan 31, 2025)
f38c34a  Fix batch_pos (angt, Jan 31, 2025)
960c12b  backend(llama): add CUDA Dockerfile_llamacpp for now (mfuntowicz, Jan 31, 2025)
161280f  Only export the latest logits (angt, Feb 1, 2025)
2a51e41  Output real logprobs (angt, Feb 1, 2025)
96434a1  Fix batching (angt, Feb 1, 2025)
27534d8  Fix seq iterations (angt, Feb 1, 2025)
c8505fb  Auto-detect n_threads when not provided (angt, Feb 1, 2025)
8ed362d  Clear request cache after completion (angt, Feb 1, 2025)
104a968  Remove warmup (angt, Feb 1, 2025)
ea28332  Cleanup (angt, Feb 1, 2025)
e6a8d33  backend(llama): add CUDA architectures build argument for Dockerfile (mfuntowicz, Feb 3, 2025)
bfb8e03  Add specific args for batch (angt, Feb 3, 2025)
38b33e9  Add --type-v & --type-k (angt, Feb 3, 2025)
207041a  Bump llamacpp to b4623 (angt, Feb 3, 2025)
d883109  Disable graceful shutdown in debug mode (angt, Feb 3, 2025)
df2a4fb  Update Dockerfile_llamacpp (angt, Feb 4, 2025)
906c265  Cleanup Dockerfile (angt, Feb 4, 2025)
e007529  Update Cargo.lock (angt, Feb 4, 2025)
d3a772a  Update args (angt, Feb 5, 2025)
dbee804  Simplify batching logic (angt, Feb 5, 2025)
c52f083  Set TGI_LLAMA_PKG_CUDA from CUDA_VERSION (angt, Feb 5, 2025)
051ff2d  Rename bindings (angt, Feb 5, 2025)
09a745f  Remove n_ctx (angt, Feb 5, 2025)
5b77787  Make max_batch_total_tokens optional (angt, Feb 5, 2025)
695b129  Ensure all samplers are freed on error (angt, Feb 5, 2025)
0f62401  Initialize penalty_last_n with llamacpp default value (angt, Feb 5, 2025)
f22e2fb  Cleanup (angt, Feb 5, 2025)
b3e40c4  Improve default settings (angt, Feb 5, 2025)
1641c22  Add doc (angt, Feb 5, 2025)
e4d5fa7  Update docs (angt, Feb 6, 2025)
fb81c0d  Thanks clippy (angt, Feb 6, 2025)
2b0d99c  Thanks cargo fmt (angt, Feb 6, 2025)
8bc10d3  Update docs (angt, Feb 6, 2025)
7bff88b  Do not use HOSTNAME env (angt, Feb 6, 2025)
df723e6  Bump llama.cpp & cuda (angt, Feb 6, 2025)
5367d94  Fix requirements.txt (angt, Feb 6, 2025)
809e288  Fix fmt (angt, Feb 6, 2025)
3b1b049  Enable KQV offload by default (angt, Feb 6, 2025)
acca9c3  Remove Ngrok tunneling (angt, Feb 6, 2025)
0d27ee7  Remove .cargo/config.toml (angt, Feb 7, 2025)
4841f71  Fix Dockerfile (angt, Feb 7, 2025)
b6cfa0f  Add missing cuda prefix (angt, Feb 7, 2025)
6bdb644  Handle custom llama.cpp dir (angt, Feb 7, 2025)
0702e0b  Cleanup (angt, Feb 7, 2025)
508d47f  Add README.md (angt, Feb 7, 2025)
1401418  Add HF transfer (angt, Feb 7, 2025)
b77d05d  Fix bool args (angt, Feb 7, 2025)
d96a777  Update doc (angt, Feb 7, 2025)
5fb4afb  Update doc (angt, Feb 7, 2025)
925 changes: 504 additions & 421 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -5,6 +5,7 @@ members = [
"backends/v3",
"backends/grpc-metadata",
"backends/trtllm",
"backends/llamacpp",
"launcher",
"router"
]
75 changes: 75 additions & 0 deletions Dockerfile_llamacpp
@@ -0,0 +1,75 @@
FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps

ARG llamacpp_version=b4651
ARG llamacpp_cuda=OFF
ARG cuda_arch=75-real;80-real;86-real;89-real;90-real
ENV TGI_LLAMA_PKG_CUDA=cuda-${CUDA_VERSION%.*}

WORKDIR /opt/src

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt install -y \
clang \
cmake \
curl \
git \
python3-dev \
libssl-dev \
pkg-config \
tar

ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
RUN tar -xzf ${llamacpp_version}.tar.gz \
&& cd llama.cpp-${llamacpp_version} \
&& cmake -B build \
-DCMAKE_INSTALL_PREFIX=/usr \
-DCMAKE_INSTALL_LIBDIR=/usr/lib \
-DCMAKE_C_COMPILER=clang \
-DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
-DGGML_CUDA=${llamacpp_cuda} \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_SERVER=OFF \
&& cmake --build build --parallel --config Release \
&& cmake --install build

WORKDIR /app
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef --locked

FROM deps AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

FROM deps AS builder
COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook \
--recipe-path recipe.json \
--profile release-opt \
--package text-generation-router-llamacpp
COPY . .
RUN cargo build \
--profile release-opt \
--package text-generation-router-llamacpp --frozen

FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04

RUN apt update && apt install -y \
python3-venv \
python3-pip

RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

COPY backends/llamacpp/requirements.txt requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

COPY --from=builder /usr/lib/libllama.so /usr/lib/
COPY --from=builder /usr/lib/libggml*.so /usr/lib/
COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /usr/bin/

ENTRYPOINT ["text-generation-router-llamacpp"]
2 changes: 2 additions & 0 deletions backends/llamacpp/.cargo/config.toml
@@ -0,0 +1,2 @@
[build]
rustflags = ["-C", "target-cpu=native"]
21 changes: 21 additions & 0 deletions backends/llamacpp/Cargo.toml
@@ -0,0 +1,21 @@
[package]
name = "text-generation-router-llamacpp"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[build-dependencies]
bindgen = "0.71.1"
pkg-config = "0.3.31"

[dependencies]
async-trait = "0.1.85"
clap = "4.5.27"
num_cpus = "1.16.0"
text-generation-router = { path = "../../router" }
thiserror = "2.0.11"
tokenizers.workspace = true
tokio = "1.43.0"
tokio-stream = "0.1.17"
tracing = "0.1.41"
57 changes: 57 additions & 0 deletions backends/llamacpp/build.rs
@@ -0,0 +1,57 @@
use bindgen::callbacks::{ItemInfo, ParseCallbacks};
use std::collections::HashMap;
use std::env;
use std::path::PathBuf;

fn inject_transient_dependencies(lib_search_path: Option<&str>, lib_target_hardware: &str) {
let hardware_targets = HashMap::from([("cpu", None), ("cuda", Some(vec!["cuda"]))]);

if let Some(lib_search_path) = lib_search_path {
lib_search_path.split(":").for_each(|path| {
println!("cargo:rustc-link-search=dependency={path}");
});
}

if let Some(hardware_transient_deps) = hardware_targets.get(lib_target_hardware) {
if let Some(additional_transient_deps) = hardware_transient_deps {
additional_transient_deps.iter().for_each(|dep| {
println!("cargo:rustc-link-lib={dep}");
});
}
}
}

#[derive(Debug)]
struct PrefixStripper;

impl ParseCallbacks for PrefixStripper {
fn generated_name_override(&self, item_info: ItemInfo<'_>) -> Option<String> {
item_info.name.strip_prefix("llama_").map(str::to_string)
}
}

fn main() {
let pkg_cuda = option_env!("TGI_LLAMA_PKG_CUDA");
let lib_search_path = option_env!("TGI_LLAMA_LD_LIBRARY_PATH");
let lib_target_hardware = option_env!("TGI_LLAMA_HARDWARE_TARGET").unwrap_or("cpu");

let bindings = bindgen::Builder::default()
.header("src/wrapper.h")
.prepend_enum_name(false)
.parse_callbacks(Box::new(PrefixStripper))
.parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
.generate()
.expect("Unable to generate bindings");

let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
bindings
.write_to_file(out_path.join("llamacpp.rs"))
.expect("Couldn't write bindings!");

if let Some(pkg_cuda) = pkg_cuda {
pkg_config::Config::new().probe(pkg_cuda).unwrap();
}
pkg_config::Config::new().probe("llama").unwrap();

inject_transient_dependencies(lib_search_path, lib_target_hardware);
}
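
As an illustration of what the PrefixStripper callback buys, a minimal (hypothetical) consumer of the generated bindings could look like the sketch below; the module name and the backend_init call are assumptions based on the llama_ prefix stripping and the llamacpp.rs output file above.

// Hypothetical sketch: pull in the bindings written to $OUT_DIR/llamacpp.rs by build.rs.
mod llamacpp {
    #![allow(non_upper_case_globals, non_camel_case_types, non_snake_case, dead_code)]
    include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
}

fn main() {
    // With PrefixStripper, the C function `llama_backend_init` is generated as
    // `backend_init`, so it is called here as `llamacpp::backend_init()`.
    unsafe { llamacpp::backend_init() };
}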
2 changes: 2 additions & 0 deletions backends/llamacpp/requirements.txt
@@ -0,0 +1,2 @@
transformers==4.48.2
huggingface-hub==0.28.1