[Backend] Bump TRTLLM to v.0.17.0 (#2991)

* backend(trtllm): bump TRTLLM to v.0.17.0 * backend(trtllm): forgot to bump dockerfile * backend(trtllm): use arg instead of env * backend(trtllm): use correct library reference decoder_attention_src * backend(trtllm): link against decoder_attention_{0|1} * backend(trtllm): build against gcc-14 with cuda12.8 * backend(trtllm): use return value optimization flag as an error if available * backend(trtllm): make sure we escalate all warnings as errors on the backend impl in debug mode * backend(trtllm): link against CUDA 12.8
huggingface · Feb 6, 2025 · 856709d · 856709d
1 parent 36223f8
commit 856709d
Show file tree

Hide file tree

Showing 5 changed files with 27 additions and 19 deletions.
diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm
@@ -1,12 +1,14 @@
-ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real"
+ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real;100-real;120-real"
+ARG cuda_base=12.8.0
 ARG build_type=release
 ARG ompi_version=4.1.7
 ARG sccache_gha_enabled=off
 ARG actions_cache_url=""
 ARG actions_runtime_token=""
 
+
 # CUDA dependent dependencies resolver stage
-FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
+FROM nvidia/cuda:${cuda_base}-cudnn-devel-ubuntu24.04 AS cuda-builder
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
     build-essential \
@@ -98,14 +100,16 @@ COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
 
 ENV RUSTC_WRAPPER=sccache
 ENV CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX
-RUN export CMAKE_C_COMPILER_LAUNCHER=sccache && \
+RUN export CC=gcc-14 && \
+    export CXX=g++-14 && \
+    export CMAKE_C_COMPILER_LAUNCHER=sccache && \
     export CMAKE_CXX_COMPILER_LAUNCHER=sccache && \
     export CMAKE_CUDA_COMPILER_LAUNCHER=sccache && \
     mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
     cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \
     sccache --show-stats
 
-FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
+FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS runtime
 RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
     rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
     pipx ensurepath && \
@@ -124,7 +128,7 @@ COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
 COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
 
 # This is used only for the CI/CD
-FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime
+FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS ci-runtime
 RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
     rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
     pipx ensurepath && \

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
@@ -59,7 +59,9 @@ target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugi
 
 # This installs all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make them easy to link against and find later
 install(TARGETS tgi_trtllm_backend_impl)
-install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
+#install(TARGETS cutlass_src fb_gemm_src fpA_intB_gemm_src gemm_swiglu_sm90_src kernels_src)
+install(TARGETS decoder_attention_0 decoder_attention_1)
+install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention_src executorWorker)
 install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB)
 if (NOT ${TGI_TRTLLM_BACKEND_DEBUG})
     install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
@@ -82,8 +84,9 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS} AND CMAKE_BUILD_TYPE MATCHES "Debug")
     check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
     if (${COMPILER_SUPPORT_WARNING_ON_NVRO})
         message(STATUS "Enabling non-NVRO detection")
-        target_compile_options(tgi_trtllm_backend_impl "-Wnvro")
+        target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wnrvo)
     endif ()
+    target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wall)
 
     cmake_path(GET TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH PARENT_PATH TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH)
     message(STATUS "Adding linking path: ${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}")

diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
@@ -7,7 +7,7 @@ use std::sync::LazyLock;
 
 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
-const CUDA_REQUIRED_VERSION: &str = "12.6";
+const CUDA_REQUIRED_VERSION: &str = "12.8";
 const MPI_REQUIRED_VERSION: &str = "4.1";
 const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
 const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
@@ -25,11 +25,12 @@ const IS_GHA_BUILD: LazyLock<bool> = LazyLock::new(|| {
 // Dependencies
 const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl";
 const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
-const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 4] = [
+const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
     ("dylib", "tensorrt_llm"),
     ("dylib", "tensorrt_llm_nvrtc_wrapper"),
     ("dylib", "nvinfer_plugin_tensorrt_llm"),
-    ("dylib", "decoder_attention"),
+    ("dylib", "decoder_attention_0"),
+    ("dylib", "decoder_attention_1"),
 ];
 
 macro_rules! probe {

diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake
@@ -28,7 +28,7 @@ find_package(Python3 REQUIRED Interpreter)
 fetchcontent_declare(
         trtllm
         GIT_REPOSITORY https://github.com/nvidia/TensorRT-LLM.git
-        GIT_TAG v0.16.0
+        GIT_TAG v0.17.0
         GIT_SHALLOW ON
         DOWNLOAD_EXTRACT_TIMESTAMP
 )

diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh
@@ -2,13 +2,13 @@
 
 set -ex
 
-TRT_VER_BASE="10.7.0"
-TRT_VER_FULL="${TRT_VER_BASE}.23"
-CUDA_VER="12.6"
-CUDNN_VER="9.5.0.50-1"
-NCCL_VER="2.22.3-1+cuda12.6"
-CUBLAS_VER="12.6.3.3-1"
-NVRTC_VER="12.6.77-1"
+TRT_VER_BASE="10.8.0"
+TRT_VER_FULL="${TRT_VER_BASE}.43"
+CUDA_VER="12.8"
+CUDNN_VER="9.7.0.66-1"
+NCCL_VER="2.25.1-1+cuda${CUDA_VER}"
+CUBLAS_VER="${CUDA_VER}.3.14-1"
+NVRTC_VER="${CUDA_VER}.61-1"
 
 for i in "$@"; do
     case $i in
@@ -73,7 +73,7 @@ install_centos_requirements() {
 install_tensorrt() {
     #PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
     #PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")
-    TRT_CUDA_VERSION="12.6"
+    TRT_CUDA_VERSION="12.8"
 
     if [ -z "$RELEASE_URL_TRT" ];then
         ARCH=${TRT_TARGETARCH}