enable parallel prefill again
Differential Revision: D61751873

Pull Request resolved: pytorch#4893
kimishpatel authored Aug 27, 2024
1 parent f92139f commit 395d3f5
Showing 2 changed files with 1 addition and 2 deletions.
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/runner.cpp
@@ -126,7 +126,7 @@ Error Runner::load() {
       tokenizer_.get(),
       text_decoder_runner_.get(),
       metadata_.at(kUseKVCache),
-      enable_parallel_prefill_);
+      metadata_.at(kEnableDynamicShape));
 
   text_token_generator_ = std::make_unique<TextTokenGenerator>(
       tokenizer_.get(),
1 change: 0 additions & 1 deletion examples/models/llama2/runner/runner.h
@@ -45,7 +45,6 @@ class Runner {
 
  private:
   float temperature_;
-  bool enable_parallel_prefill_;
   bool shouldStop_{false};
 
   // model

0 comments on commit 395d3f5