Update TensorRT-LLM (NVIDIA#1358)
Co-authored-by: Kaiyu <[email protected]>
Shixiaowei02 and kaiyux authored Mar 26, 2024
1 parent 66ca337 commit 850b6fa
Showing 328 changed files with 436,786 additions and 6,623 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -34,7 +34,7 @@
- Optimize AllReduce for parallel attention on Falcon and GPT-J
- Enable split-k for weight-only cutlass kernel when SM>=75
* Documentation
- - Add [documentation for new builder workflow](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/new_workflow.md)
+ - Add [documentation for convert/build workflow](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/checkpoint.md)

## Versions 0.6.0 / 0.6.1

5 changes: 3 additions & 2 deletions README.md
@@ -282,6 +282,7 @@ The list of supported models is:
* [InternLM](examples/internlm)
* [LLaMA](examples/llama)
* [LLaMA-v2](examples/llama)
+ * [Mamba](examples/mamba)
* [mBART](examples/enc_dec)
* [Mistral](examples/llama#mistral-v01)
* [MPT](examples/mpt)
@@ -454,7 +455,7 @@ For example: `mpirun -n 1 python3 examples/run.py ...`
- Support FP16 fMHA on NVIDIA V100 GPU
* API
- Add a set of High-level APIs for end-to-end generation tasks (see examples/high-level-api/README.md)
- - **[BREAKING CHANGES]** Migrate models to the new build workflow, including LLaMA, Mistral, Mixtral, InternLM, ChatGLM, Falcon, GPT-J, GPT-NeoX, Medusa, MPT, Baichuan and Phi (see docs/source/new_workflow.md)
+ - **[BREAKING CHANGES]** Migrate models to the new build workflow, including LLaMA, Mistral, Mixtral, InternLM, ChatGLM, Falcon, GPT-J, GPT-NeoX, Medusa, MPT, Baichuan and Phi (see docs/source/checkpoint.md)
- **[BREAKING CHANGES]** Deprecate `LayerNorm` and `RMSNorm` plugins and removed corresponding build parameters
- **[BREAKING CHANGES]** Remove optional parameter `maxNumSequences` for GPT manager
* Bug fixes
@@ -482,7 +483,7 @@ For example: `mpirun -n 1 python3 examples/run.py ...`
- Batch manager arguments documentation updates
- Add documentation for best practices for tuning the performance of TensorRT-LLM (See docs/source/perf_best_practices.md)
- Add documentation for Falcon AWQ support (See examples/falcon/README.md)
- - Update to the `docs/source/new_workflow.md` documentation
+ - Update to the `docs/source/checkpoint.md` documentation
- Update AWQ INT4 weight only quantization documentation for GPT-J
- Add blog: Speed up inference with SOTA quantization techniques in TRT-LLM
- Refine TensorRT-LLM backend README structure #133
6 changes: 4 additions & 2 deletions benchmarks/cpp/CMakeLists.txt
@@ -19,8 +19,10 @@ set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")

add_custom_target(benchmarks)

- set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
- add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)
+ if(NOT TARGET cxxopts::cxxopts)
+   set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
+   add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)
+ endif()

function(add_benchmark test_name test_src)
add_executable(${test_name} ${test_src})
128 changes: 128 additions & 0 deletions benchmarks/cpp/README.md
@@ -127,6 +127,7 @@ python prepare_dataset.py \

For `tokenizer`, you can specify either the path to a local tokenizer that has already been downloaded, or simply the name of a tokenizer hosted on HuggingFace, such as `meta-llama/Llama-2-7b`; in the latter case the tokenizer is downloaded automatically.
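For instance, both of the following invocations are expected to work (a hedged sketch; `...` stands for the remaining `prepare_dataset.py` arguments shown above):

```
# Path to a tokenizer that has already been downloaded locally
python prepare_dataset.py --tokenizer /path/to/local/tokenizer ...
# Tokenizer name on HuggingFace; downloaded automatically
python prepare_dataset.py --tokenizer meta-llama/Llama-2-7b ...
```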


#### Prepare TensorRT-LLM engines
If you would like to benchmark inflight batching, please make sure that the engines are built with the `--use_inflight_batching` and `--remove_input_padding` arguments; for more details, please see the documentation in the TensorRT-LLM examples.
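With engines built through the `trtllm-build` flow used later in this section, the switches relevant to inflight batching look like the sketch below (`${CONVERTED_CHECKPOINT}` and `${ENGINE_DIR}` are placeholders; the other required arguments are omitted):

```
# Minimal sketch: only the flags relevant to inflight batching are shown
trtllm-build --checkpoint_dir ${CONVERTED_CHECKPOINT} \
             --gpt_attention_plugin float16 \
             --paged_kv_cache enable \
             --remove_input_padding enable \
             --output_dir ${ENGINE_DIR}
```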

@@ -187,3 +188,130 @@ Take GPT-350M as an example for single GPU with static batching
--static_emulated_timeout 100 \
--dataset ../../benchmarks/cpp/tokens-fixed-lengths.json
```

#### Benchmarking LoRA

Using either of the `prepare_dataset.py` methods above, add `--rand-task-id <start-id> <end-id>` to the command. This assigns each request a random `task_id` between `<start-id>` and `<end-id>`, inclusive.
You can then use `utils/generate_rand_loras.py` to generate random LoRA weights for benchmarking purposes; it takes as input an example LoRA for the model you are benchmarking.
Finally, run `gptManagerBenchmark` with `--type IFB` and `--lora_dir /path/to/utils/generate_rand_loras/output`.

End-to-end LoRA benchmarking script:

```
git-lfs clone https://huggingface.co/meta-llama/Llama-2-13b-hf
git-lfs clone https://huggingface.co/hfl/chinese-llama-2-lora-13b
MODEL_CHECKPOINT=Llama-2-13b-hf
CONVERTED_CHECKPOINT=Llama-2-13b-hf-ckpt
TOKENIZER=Llama-2-13b-hf
LORA_ENGINE=Llama-2-13b-hf-engine
DTYPE=float16
TP=2
PP=1
MAX_LEN=1024
MAX_BATCH=32
MAX_LORA_RANK=32
SOURCE_LORA=chinese-llama-2-lora-13b
CPP_LORA=chinese-llama-2-lora-13b-cpp
EG_DIR=/tmp/lora-eg
# The values below are assumed for Llama-2-13b with the attn_qkv LoRA target
# (40 layers, one LoRA module, EOS token id 2); adjust them for your model.
NUM_LAYERS=40
NUM_LORA_MODS=1
EOS_ID=2
# Build a LoRA-enabled engine
python examples/llama/convert_checkpoint.py --model_dir ${MODEL_CHECKPOINT} \
--output_dir ${CONVERTED_CHECKPOINT} \
--dtype ${DTYPE} \
--tp_size ${TP} \
--pp_size 1 \
--lora_target_modules attn_qkv \
--max_lora_rank ${MAX_LORA_RANK}
${HOME}/.local/bin/trtllm-build \
--checkpoint_dir ${CONVERTED_CHECKPOINT} \
--output_dir ${LORA_ENGINE} \
--max_batch_size ${MAX_BATCH} \
--max_input_len $MAX_LEN \
--max_output_len $MAX_LEN \
--gpt_attention_plugin float16 \
--paged_kv_cache enable \
--remove_input_padding enable \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
--use_custom_all_reduce disable
NUM_LORAS=(8 16 24 32 64 128 256)
NUM_REQUESTS=1024
# Convert LoRA to cpp format
python examples/gpt/nemo_lora_convert.py \
-i $SOURCE_LORA \
--storage-type $DTYPE \
--write-cpp-runtime-tensors \
-o $CPP_LORA
# Prepare datasets
mkdir -p $EG_DIR/data
# Prepare dataset without lora_task_id
python benchmarks/cpp/prepare_dataset.py \
--output "${EG_DIR}/data/token-norm-dist.json" \
--request-rate -1 \
--time-delay-dist constant \
--tokenizer $TOKENIZER \
token-norm-dist \
--num-requests $NUM_REQUESTS \
--input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
# Prepare datasets with lora_task_ids from 0 to $nloras - 1
for nloras in ${NUM_LORAS[@]}; do
python benchmarks/cpp/prepare_dataset.py \
--output "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
--request-rate -1 \
--time-delay-dist constant \
--rand-task-id 0 $(( $nloras - 1 )) \
--tokenizer $TOKENIZER \
token-norm-dist \
--num-requests $NUM_REQUESTS \
--input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
done
# Generate random lora weights for 256 adapters
python benchmarks/cpp/utils/generate_rand_loras.py ${CPP_LORA} ${EG_DIR}/loras 256
# Perform benchmarking
# First, run inference without LoRAs
mkdir -p ${EG_DIR}/log-base-lora
mpirun -n ${TP} --output-filename ${EG_DIR}/log-base-lora \
cpp/build_Debug/benchmarks/gptManagerBenchmark \
--engine_dir $LORA_ENGINE \
--type IFB \
--dataset "${EG_DIR}/data/token-norm-dist.json" \
--lora_host_cache_bytes 8589934592 \
--lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
--kv_cache_free_gpu_mem_fraction 0.80 \
--log_level info \
--eos_id ${EOS_ID}
# Now run inference with various numbers of LoRAs.
# The host cache is set large enough to hold all the LoRAs in lora_dir.
# The GPU cache is set to hold 32 LoRAs.
# This benchmark preloads all the LoRAs into the host cache.
# We run inference on a range of active LoRAs, exercising different cache miss rates.
for nloras in ${NUM_LORAS[@]}; do
mkdir -p ${EG_DIR}/log-lora-${nloras}
mpirun -n ${TP} --output-filename "${EG_DIR}/log-lora-${nloras}" \
cpp/build_Debug/benchmarks/gptManagerBenchmark \
--engine_dir $LORA_ENGINE \
--type IFB \
--dataset "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
--lora_host_cache_bytes 8589934592 \
--lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
--kv_cache_free_gpu_mem_fraction 0.80 \
--log_level info \
--eos_id ${EOS_ID} \
--lora_dir ${EG_DIR}/loras
done
```
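A note on cache sizing: `--lora_num_device_mod_layers` is set to `32 * NUM_LAYERS * NUM_LORA_MODS * MAX_LORA_RANK`, which (per the comments in the script) sizes the GPU cache to hold roughly 32 LoRAs, one entry per layer and LoRA module at the maximum rank; scale the leading factor of 32 to keep more or fewer adapters resident on the GPU.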