Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADD] add transformer_int4_fp16_loadlowbit_gpu_win api #11511

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/llm/dev/benchmark/all-in-one/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ test_api:
# - "transformer_int4_gpu" # on Intel GPU, transformer-like API, (qtype=int4), (dtype=fp32)
# - "transformer_int4_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4), (dtype=fp32)
# - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4), use load_low_bit API. Please make sure you have used the save.py to save the converted low bit model
# - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4), (dtype=fp16), use load_low_bit API. Please make sure you have used the save.py to save the converted low bit model
# - "bigdl_fp16_gpu" # on Intel GPU, use ipex-llm transformers API, (dtype=fp16), (qtype=fp16)
# - "optimize_model_gpu" # on Intel GPU, can optimize any pytorch models include transformer model
# - "deepspeed_optimize_model_gpu" # on Intel GPU, deepspeed autotp inference
Expand Down
107 changes: 107 additions & 0 deletions python/llm/dev/benchmark/all-in-one/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
# drop the results of the first time for better performance
run_transformer_int4_loadlowbit_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size, streaming)
result = run_transformer_int4_loadlowbit_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size, streaming)
elif test_api == 'transformer_int4_fp16_loadlowbit_gpu_win':
# drop the results of the first time for better performance
run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size, streaming)
result = run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size, streaming)
elif test_api == 'transformer_autocast_bf16':
result = run_transformer_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
elif test_api == 'bigdl_ipex_bf16':
Expand Down Expand Up @@ -1191,6 +1195,109 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
return result


def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
                                                 local_model_hub,
                                                 in_out_pairs,
                                                 warm_up,
                                                 num_trials,
                                                 num_beams,
                                                 low_bit,
                                                 cpu_embedding,
                                                 batch_size,
                                                 streaming):
    """Benchmark a saved low-bit model in fp16 on an Intel GPU (XPU).

    The model is loaded with ``load_low_bit`` from ``<model_path>-<low_bit>``,
    cast to half precision, moved to ``'xpu'`` and wrapped in
    ``BenchmarkWrapper``. For every entry of ``in_out_pairs`` the model
    generates ``warm_up + num_trials`` times; only the non-warm-up runs are
    recorded.

    Args:
        repo_id: Model identifier; also selects the loader/tokenizer branch
            (``CHATGLM_IDS``, ``LLAMA_IDS``, ``LLAVA_IDS`` or the default).
        local_model_hub: Passed to ``get_model_path`` to resolve the model path.
        in_out_pairs: Iterable of ``"<input_len>-<output_len>"`` strings.
        warm_up: Number of leading generate calls per pair that are not recorded.
        num_trials: Number of recorded generate calls per pair.
        num_beams: Forwarded to ``model.generate``.
        low_bit: Low-bit format name; used as the suffix of the saved model
            directory.
        cpu_embedding: Forwarded to ``load_low_bit``.
        batch_size: Number of copies of the prompt in each batch.
        streaming: When true, generation output is streamed through a
            ``TextStreamer`` instead of being printed after decoding.

    Returns:
        dict mapping each ``in_out`` string to a list with one entry per
        recorded trial:
        ``[first_cost, rest_cost_mean, encoder_time, actual_in_len,
        actual_out_len, load_time, peak_memory]``.
        A pair that raised ``RuntimeError`` may be missing or only partially
        filled; the traceback is printed and the benchmark continues.
    """
    from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
    from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
    model_path = get_model_path(repo_id, local_model_hub)
    low_bit_model_path = model_path + '-' + low_bit
    # Load BigDL-LLM optimized low bit model
    st = time.perf_counter()
    if repo_id in CHATGLM_IDS:
        model = AutoModel.load_low_bit(low_bit_model_path, optimize_model=True, trust_remote_code=True,
                                       use_cache=True, cpu_embedding=cpu_embedding).eval()
        tokenizer = AutoTokenizer.from_pretrained(low_bit_model_path, trust_remote_code=True)
    elif repo_id in LLAMA_IDS:
        model = AutoModelForCausalLM.load_low_bit(low_bit_model_path, optimize_model=True, trust_remote_code=True,
                                                  use_cache=True, cpu_embedding=cpu_embedding).eval()
        tokenizer = LlamaTokenizer.from_pretrained(low_bit_model_path, trust_remote_code=True)
    elif repo_id in LLAVA_IDS:
        llava_repo_dir = os.environ.get('LLAVA_REPO_DIR')
        sys.path.append(rf"{llava_repo_dir}")
        # Imported from the LLaVA checkout added to sys.path above.
        from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
        model = AutoModelForCausalLM.load_low_bit(low_bit_model_path, optimize_model=True, trust_remote_code=True,
                                                  use_cache=True, cpu_embedding=cpu_embedding).eval()
        tokenizer = AutoTokenizer.from_pretrained(low_bit_model_path, trust_remote_code=True)
    else:
        model = AutoModelForCausalLM.load_low_bit(low_bit_model_path, optimize_model=True, trust_remote_code=True,
                                                  use_cache=True, cpu_embedding=cpu_embedding).eval()
        tokenizer = AutoTokenizer.from_pretrained(low_bit_model_path, trust_remote_code=True)
    # This API benchmarks fp16: cast to half after loading instead of passing
    # a torch_dtype to load_low_bit, then move the model to the Intel GPU.
    model = model.half().to('xpu')
    end = time.perf_counter()
    load_time = end - st
    print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))

    model = BenchmarkWrapper(model)
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Lengths for which a prompt file prompt/continuation/<length>.txt is used.
    prompt_lengths = (32, 256, 1024, 2048, 8192)

    result = {}
    with torch.inference_mode():
        for in_out in in_out_pairs:
            try:
                in_out_len = in_out.split("-")
                in_len = int(in_out_len[0])
                out_len = int(in_out_len[1])
                # As different tokenizers have different encodings,
                # <in_len>.txt may be shorter than we need, so use a much
                # longer context to make sure the input is long enough.
                # Pick the smallest available prompt that covers twice the
                # requested length (capped at the largest prompt). Doubling
                # until an exact match would never terminate for an in_len
                # that is not a power of two.
                wanted_length = min(in_len * 2, prompt_lengths[-1])
                test_length = next(length for length in prompt_lengths
                                   if length >= wanted_length)
                with open(f"prompt/continuation/{test_length}.txt", 'r') as prompt_file:
                    input_str = prompt_file.read()
                # As different tokenizers have different encodings,
                # slice the input_ids to ensure the prompt has the required length.
                input_ids = tokenizer.encode(input_str, return_tensors="pt")
                input_ids = input_ids[:, :in_len]
                true_str = tokenizer.batch_decode(input_ids)[0]
                input_list = [true_str] * batch_size
                input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
                actual_in_len = input_ids.shape[1]
                result[in_out] = []
                for i in range(num_trials + warm_up):
                    st = time.perf_counter()
                    if streaming:
                        output_ids = model.generate(input_ids, do_sample=False,
                                                    max_new_tokens=out_len, min_new_tokens=out_len,
                                                    num_beams=num_beams, streamer=streamer)
                    else:
                        output_ids = model.generate(input_ids, do_sample=False,
                                                    max_new_tokens=out_len, min_new_tokens=out_len,
                                                    num_beams=num_beams)
                    # Wait for queued XPU work so the timing covers the whole generation.
                    torch.xpu.synchronize()
                    end = time.perf_counter()
                    output_ids = output_ids.cpu()
                    print("model generate cost: " + str(end - st))
                    output = tokenizer.batch_decode(output_ids)
                    if not streaming:
                        print(output[0])
                    actual_out_len = output_ids.shape[1] - actual_in_len
                    # Warm-up iterations are run but not recorded.
                    if i >= warm_up:
                        result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
                                               actual_in_len, actual_out_len, load_time, model.peak_memory])
                    # torch.xpu.empty_cache() # this may make first token slower
            except RuntimeError:
                # Best effort: report the failure and continue with the next pair.
                traceback.print_exc()
            torch.xpu.synchronize()
            torch.xpu.empty_cache()
    # Release device memory before returning so the next benchmark starts clean.
    model.to('cpu')
    torch.xpu.synchronize()
    torch.xpu.empty_cache()
    del model
    gc.collect()
    return result


def run_transformer_autocast_bf16( repo_id,
local_model_hub,
in_out_pairs,
Expand Down
Loading