Update TensorRT-LLM (NVIDIA#1274)
* Update TensorRT-LLM

---------

Co-authored-by: meghagarwal <[email protected]>
Co-authored-by: Shixiaowei02 <[email protected]>
3 people authored Mar 12, 2024
1 parent 728cc00 commit 4bb65f2
Showing 488 changed files with 23,178 additions and 10,463 deletions.
1 change: 1 addition & 0 deletions .clang-format
@@ -59,6 +59,7 @@ PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
+QualifierAlignment: Right
ReflowComments: true
SeparateDefinitionBlocks: Always
SortIncludes: CaseSensitive
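The new `QualifierAlignment: Right` setting makes clang-format place qualifiers such as `const` to the right of the type they qualify ("east const" style); the `bertBenchmark.cpp` hunk further down in this commit is that rule applied. A minimal sketch of the effect, using hypothetical code rather than anything from the repository:

```
#include <iostream>
#include <stdexcept>
#include <string>

// Under QualifierAlignment: Right, clang-format rewrites, for example,
//   const std::string& name  ->  std::string const& name
void greet(std::string const& name)
{
    if (name.empty())
    {
        throw std::invalid_argument("name must not be empty");
    }
    std::cout << "Hello, " << name << "!\n";
}

int main()
{
    try
    {
        greet("TensorRT-LLM");
    }
    catch (std::exception const& e) // same reordering as in bertBenchmark.cpp below
    {
        std::cerr << e.what() << '\n';
        return 1;
    }
    return 0;
}
```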
10 changes: 10 additions & 0 deletions .gitignore
@@ -17,6 +17,16 @@ venv/
.local/
.hypothesis/
.idea/
+dump*/
+.trt-internal
+*.dot
+*.prof
+*.log
+*.pkl
+*.hdf5
+*.lock
+config.json
+/*.svg
cpp/cmake-build-*
cpp/.ccache/
tensorrt_llm/libs
3 changes: 3 additions & 0 deletions README.md
@@ -355,6 +355,9 @@ however, that it is recommended to use the C++ version.

## Troubleshooting

+* If you encounter accuracy issues in the generated text, you may want to increase
+  the internal precision in the attention layer. For that, pass the `--context_fmha_fp32_acc enable` to
+  `trtllm-build`.

* It's recommended to add options `--shm-size=1g --ulimit memlock=-1` to the
docker or nvidia-docker run command. Otherwise you may see NCCL errors when
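A usage sketch of the new troubleshooting tip; the checkpoint and output paths below are placeholders, and only the `--context_fmha_fp32_acc enable` flag itself comes from this commit:

```
# Hypothetical build invocation; adjust paths to your converted checkpoint.
trtllm-build --checkpoint_dir ./gpt_checkpoint \
             --output_dir ./gpt_engine \
             --context_fmha_fp32_acc enable
```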
5 changes: 0 additions & 5 deletions benchmarks/cpp/README.md
@@ -39,7 +39,6 @@ Take GPT-350M as an example for single GPU

```
./benchmarks/gptSessionBenchmark \
-    --model gpt_350m \
    --engine_dir "../../benchmarks/gpt_350m/" \
    --batch_size "1" \
    --input_output_len "60,20"
@@ -50,7 +49,6 @@ Take GPT-350M as an example for single GPU
Take GPT-175B as an example for multiple GPUs
```
mpirun -n 8 ./benchmarks/gptSessionBenchmark \
-    --model gpt_175b \
    --engine_dir "../../benchmarks/gpt_175b/" \
    --batch_size "1" \
    --input_output_len "60,20"
@@ -125,7 +123,6 @@ cd cpp/build
Take GPT-350M as an example for single GPU V1 batching
```
./benchmarks/gptManagerBenchmark \
-    --model gpt \
    --engine_dir ../../examples/gpt/trt_engine/gpt2/fp16/1-gpu/ \
    --type V1 \
    --dataset ../../benchmarks/cpp/preprocessed_dataset.json
@@ -135,7 +132,6 @@ Take GPT-350M as an example for single GPU V1 batching
Take GPT-350M as an example for 2-GPU inflight batching
```
mpirun -n 2 ./benchmarks/gptManagerBenchmark \
-    --model gpt \
    --engine_dir ../../examples/gpt/trt_engine/gpt2-ib/fp16/2-gpu/ \
    --type IFB \
    --dataset ../../benchmarks/cpp/preprocessed_dataset.json
@@ -165,7 +161,6 @@ Given a `static_emulated_batch_size` of `n` the server will wait for `n` request
Take GPT-350M as an example for single GPU with static batching
```
./benchmarks/gptManagerBenchmark \
-    --model gpt \
    --engine_dir ../../examples/gpt/trt_engine/gpt2/fp16/1-gpu/ \
    --type IFB \
    --static_emulated_batch_size 32 \
2 changes: 1 addition & 1 deletion benchmarks/cpp/bertBenchmark.cpp
@@ -237,7 +237,7 @@ int main(int argc, char* argv[])
        benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens,
            logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>());
    }
-    catch (const std::exception& e)
+    catch (std::exception const& e)
    {
        TLLM_LOG_ERROR(e.what());
        return 1;