From 85df5e76991d13430fdaaf8487401deb27757db8 Mon Sep 17 00:00:00 2001
From: Shaojun Liu <61072813+liu-shaojun@users.noreply.github.com>
Date: Fri, 7 Jun 2024 09:33:14 +0800
Subject: [PATCH] fix nightly perf test (#11251)

---
 python/llm/dev/benchmark/all-in-one/run.py    | 38 +++++++++----------
 .../GPU/Deepspeed-AutoTP-FastAPI/benchmark.py |  2 +-
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index fbfd89d7ad6..3b57b62176c 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -210,7 +210,7 @@ def run_native_int4(repo_id,
         in_out_len = in_out.split("-")
         in_len = int(in_out_len[0])
         out_len = int(in_out_len[1])
-        input_str = open(f"prompt/{in_len}.txt", 'r').read()
+        input_str = open(f"prompt/continuation/{in_len}.txt", 'r').read()
         # As different tokenizer has different encodings,
         # slice the input_ids to ensure the prompt length is required length.
         n_ctx = in_len + out_len if in_len + out_len > 512 else 512
@@ -272,7 +272,7 @@ def run_transformer_int4(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -337,7 +337,7 @@ def run_pytorch_autocast_bf16(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -408,7 +408,7 @@ def run_optimize_model(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -597,7 +597,7 @@ def run_optimize_model_gpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -669,7 +669,7 @@ def run_ipex_fp16_gpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -747,7 +747,7 @@ def run_bigdl_fp16_gpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -839,7 +839,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -926,7 +926,7 @@ def run_transformer_int4_gpu_win(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1035,7 +1035,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1139,7 +1139,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1228,7 +1228,7 @@ def run_transformer_autocast_bf16( repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1299,7 +1299,7 @@ def run_bigdl_ipex_bf16(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1369,7 +1369,7 @@ def run_bigdl_ipex_int4(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1439,7 +1439,7 @@ def run_bigdl_ipex_int8(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1550,7 +1550,7 @@ def get_int_from_env(env_keys, default):
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1627,7 +1627,7 @@ def run_speculative_cpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1705,7 +1705,7 @@ def run_speculative_gpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1825,7 +1825,7 @@ def run_pipeline_parallel_gpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py
index a5a268cfddb..0e76e798809 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py
@@ -247,7 +247,7 @@ def benchmark(
 )
 args = parser.parse_args()
 
 PROMPT_LENGTH = args.prompt_length
-PROMPT = open(f"prompt/{PROMPT_LENGTH}.txt", "r").read()
+PROMPT = open(f"prompt/continuation/{PROMPT_LENGTH}.txt", "r").read()
 MAX_TOKENS = args.max_new_tokens
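
Every run.py hunk above edits the same prompt-selection pattern: pick the smallest available prompt file that is at least twice the requested input length (so the raw text is long enough regardless of the tokenizer's encoding), read it, and later slice the encoded ids down to exactly the requested length. The patch only changes where those files live: prompt/{N}.txt becomes prompt/continuation/{N}.txt. Below is a minimal standalone sketch of that selection logic; the helper name pick_prompt_file is illustrative and not part of run.py, which inlines this logic in each runner.

# Sketch of the prompt-file selection logic the patch touches (illustrative
# helper name; run.py repeats this inline in every runner function).

def pick_prompt_file(in_len: int) -> str:
    # Prompt files are only provided for these token-length buckets.
    buckets = [32, 256, 1024, 2048, 8192]
    # Start from twice the requested input length, capped at 8192, so the
    # raw text is long enough no matter how a given tokenizer encodes it;
    # the encoded ids are later sliced down to exactly in_len tokens.
    test_length = min(in_len * 2, 8192)
    # Double until we land on a bucket. This terminates for the power-of-two
    # input lengths the nightly configs use (e.g. 32, 512, 1024, 2048).
    while test_length not in buckets:
        test_length = test_length * 2
    # The fix in this patch: prompt files now live under prompt/continuation/.
    return f"prompt/continuation/{test_length}.txt"

if __name__ == "__main__":
    for in_len in (32, 512, 1024, 2048):
        print(in_len, "->", pick_prompt_file(in_len))

Note that the @@ -1550 hunk additionally guards its loop with "and test_length < 8192", which prevents the doubling from running forever if in_len*2 can never land on a bucket; the other runners rely on the configured input lengths being bucket-friendly.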