diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index fbfd89d7ad6..3b57b62176c 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -210,7 +210,7 @@ def run_native_int4(repo_id,
         in_out_len = in_out.split("-")
         in_len = int(in_out_len[0])
         out_len = int(in_out_len[1])
-        input_str = open(f"prompt/{in_len}.txt", 'r').read()
+        input_str = open(f"prompt/continuation/{in_len}.txt", 'r').read()
         # As different tokenizer has different encodings,
         # slice the input_ids to ensure the prompt length is required length.
         n_ctx = in_len + out_len if in_len + out_len > 512 else 512
@@ -272,7 +272,7 @@ def run_transformer_int4(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -337,7 +337,7 @@ def run_pytorch_autocast_bf16(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -408,7 +408,7 @@ def run_optimize_model(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -597,7 +597,7 @@ def run_optimize_model_gpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -669,7 +669,7 @@ def run_ipex_fp16_gpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -747,7 +747,7 @@ def run_bigdl_fp16_gpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -839,7 +839,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -926,7 +926,7 @@ def run_transformer_int4_gpu_win(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1035,7 +1035,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1139,7 +1139,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1228,7 +1228,7 @@ def run_transformer_autocast_bf16( repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1299,7 +1299,7 @@ def run_bigdl_ipex_bf16(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1369,7 +1369,7 @@ def run_bigdl_ipex_int4(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1439,7 +1439,7 @@ def run_bigdl_ipex_int8(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1550,7 +1550,7 @@ def get_int_from_env(env_keys, default):
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1627,7 +1627,7 @@ def run_speculative_cpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1705,7 +1705,7 @@ def run_speculative_gpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1825,7 +1825,7 @@ def run_pipeline_parallel_gpu(repo_id,
             test_length = min(in_len*2, 8192)
             while test_length not in [32, 256, 1024, 2048, 8192]:
                 test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py
index a5a268cfddb..0e76e798809 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py
@@ -247,7 +247,7 @@ def benchmark(
 )
 args = parser.parse_args()
 PROMPT_LENGTH = args.prompt_length
-PROMPT = open(f"prompt/{PROMPT_LENGTH}.txt", "r").read()
+PROMPT = open(f"prompt/continuation/{PROMPT_LENGTH}.txt", "r").read()
 MAX_TOKENS = args.max_new_tokens
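For reference, the prompt-loading pattern that every hunk above retargets to prompt/continuation/ works roughly as sketched below. This is a minimal sketch, not part of the diff: the helper name load_prompt_ids and the AVAILABLE_PROMPT_LENGTHS constant are illustrative only, since run.py inlines this logic in each benchmark function, and the final slice follows the "slice the input_ids" comment in the context lines.

# Minimal sketch (assumed names) of the prompt-handling pattern above.
AVAILABLE_PROMPT_LENGTHS = [32, 256, 1024, 2048, 8192]

def load_prompt_ids(tokenizer, in_len):
    # Round the requested length up to the nearest bundled prompt file by
    # doubling, as in the context lines above. The `test_length < 8192`
    # guard appears only in the hunk at -1550; it is kept here as a safety
    # stop so non-power-of-two lengths cannot loop forever.
    test_length = min(in_len * 2, 8192)
    while test_length not in AVAILABLE_PROMPT_LENGTHS and test_length < 8192:
        test_length = test_length * 2
    with open(f"prompt/continuation/{test_length}.txt", "r") as f:
        input_str = f.read()
    # Different tokenizers encode the same text to different numbers of
    # tokens, so slice input_ids back to the exact requested prompt length.
    input_ids = tokenizer.encode(input_str, return_tensors="pt")
    return input_ids[:, :in_len]

The prompt files are deliberately oversized relative to in_len (reading the file for min(in_len*2, 8192) tokens' worth of text) so that the tokenized prompt can always be truncated down to exactly in_len tokens, regardless of tokenizer.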