[REFINE] graphmode code #12540

Merged · 1 commit · Dec 16, 2024
@@ -25,15 +25,11 @@

dtype=torch.bfloat16
num_labels = 5

model_name=model_path

save_directory = model_name + "-classification"

# Initialize the tokenizer
# Need padding from the left and padding to 1024
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(save_directory)
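# A minimal sketch, not part of this diff: reloading the tokenizer saved to the
# "<model_path>-classification" directory above and padding a small batch to
# 1024 tokens, the same way the benchmark script below consumes it.
from transformers import AutoTokenizer

reloaded_tokenizer = AutoTokenizer.from_pretrained(save_directory, trust_remote_code=True)
batch = reloaded_tokenizer(["this is the first prompt"] * 4, return_tensors="pt",
                           padding="max_length", max_length=1024, truncation=True)
print(batch["input_ids"].shape)  # expected: torch.Size([4, 1024])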

84 changes: 2 additions & 82 deletions python/llm/example/GPU/GraphMode/gpt2-graph-mode-benchmark.py
@@ -17,6 +17,7 @@
import torch
import time
import argparse
import contextlib
from transformers import GPT2ForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, Qwen2ForSequenceClassification
from torch.profiler import profile, record_function, ProfilerActivity, schedule

@@ -36,12 +37,6 @@
model_path = args.model_path
print(f"The batch size is: {batch_size}, device is {device}")


######################################################################################
# PyTorch Profiling with IPEX
# export IPEX_ZE_TRACING=1
# export ZE_ENABLE_TRACING_LAYER=1
import contextlib
def profiler_setup(profiling=False, *args, **kwargs):
if profiling:
return torch.profiler.profile(*args, **kwargs)
@@ -55,33 +50,25 @@ def profiler_setup(profiling=False, *args, **kwargs):
active=1
)

# also define a handler for outputting results
# define a handler for outputting results
def trace_handler(p):
if(device == 'xpu'):
print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=20))
print(p.key_averages().table(sort_by="cpu_time_total", row_limit=20))
# p.export_chrome_trace("./trace_" + str(p.step_num) + ".json")
#######################################################################################
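# A minimal standalone sketch (illustrative values, not taken from this PR) of how a
# wait/warmup/active schedule drives torch.profiler: the first `wait` calls to
# prof.step() are ignored, the next `warmup` calls run without recording, and
# on_trace_ready fires once the `active` step has been recorded.
import torch
from torch.profiler import profile, schedule, ProfilerActivity

toy_schedule = schedule(wait=5, warmup=2, active=1)  # illustrative values

def toy_handler(p):
    print(p.key_averages().table(sort_by="cpu_time_total", row_limit=10))

with profile(activities=[ProfilerActivity.CPU], schedule=toy_schedule,
             on_trace_ready=toy_handler) as prof:
    for _ in range(10):
        torch.matmul(torch.rand(256, 256), torch.rand(256, 256))  # stand-in workload
        prof.step()  # steps 0-4 wait, steps 5-6 warm up, step 7 is recorded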



dtype = torch.bfloat16 if device == 'cpu' else torch.float16
num_labels = 5

model_name = model_path

model_name = model_name + "-classification"
model_name_ov = model_name + "-ov"
model_name_ov = model_name_ov + "-fp16"
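# Naming convention assumed here: "<model_path>-classification" is the Transformers
# checkpoint/tokenizer saved by the setup script, and
# "<model_path>-classification-ov-fp16" is a pre-exported OpenVINO FP16 copy of it.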

if (engine == 'ipex') :
import torch
import intel_extension_for_pytorch as ipex
# Need padding from the left and padding to 1024
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
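# Left padding keeps the real tokens at the end of each fixed-length sequence,
# the usual setup for decoder-style models whose classification head reads the
# final token position.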

model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype,
pad_token_id=tokenizer.eos_token_id,
low_cpu_mem_usage=True
@@ -106,122 +93,55 @@ def trace_handler(p):
tokenizer.pad_token = tokenizer.eos_token
model = OVModelForSequenceClassification.from_pretrained(model_name_ov, torch_dtype=dtype).to(device)



# Intel(R) Extension for PyTorch*
if engine == 'ipex':
if device == 'cpu':
# model = ipex.llm.optimize(model, dtype=dtype, inplace=True, deployment_mode=True)
# ############## TorchDynamo ###############
model = ipex.optimize(model, dtype=torch.bfloat16, weights_prepack=False)
model = torch.compile(model, backend='ipex')
# ##########################################
else: # Intel XPU
#model = ipex.llm.optimize(model, dtype=dtype, device="xpu", inplace=True)
model = ipex.optimize(model, dtype=dtype, inplace=True)

model=torch.compile(model, backend="inductor")
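# Graph mode in both branches: the model is first optimized with ipex.optimize()
# and then compiled, using the 'ipex' backend on CPU and the default 'inductor'
# backend on XPU.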
print(model)

# # ####### calculate the total number of parameters ########
# def model_size(model):
# return sum(t.numel() for t in model.parameters())
# print(f"GPT2 size: {model_size(model)/1000**2:.1f}M parameters")
# # # #######print model information ###################
# print(model)

# ########Enable the BetterTransformer ###################
# BetterTransformer only supports GPT2, not Qwen2
# model = BetterTransformer.transform(model)
#elif engine == 'ipex-llm':
# model = ipex.optimize(model, dtype=dtype, inplace=True)
# model=torch.compile(model) #backend="inductor")
elif engine == 'ov':
print("OV inference")


prompt = ["this is the first prompt"]
prompts = prompt * batch_size
#print(prompts)

# Tokenize the batch of prompts
inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=1024, truncation=True)
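# Padding every batch to a fixed max_length of 1024 keeps the input shape constant
# across iterations, which helps graph-mode backends avoid recompiling or
# reshaping for new shapes.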
# print(inputs)

if engine == 'ipex' or engine == 'ipex-llm':
# IPEX needs the inputs moved to the device, but OV does not
inputs.to(device)

# Initialize an empty list to store elapsed times
elapsed_times = []

# Loop for batch processing 10 times and calculate the time for every loop
with profiler_setup(profiling=enable_profile, activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
schedule=my_schedule,
on_trace_ready=trace_handler,
# on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/gpt2'),
record_shapes=True,
with_stack=True
) as prof:

for i in range(10):
start_time = time.time()

# Perform inference
with torch.inference_mode():
# logits = model(**inputs).logits
outputs = model(**inputs)
logits = outputs.logits

# Get the predicted class for each input in the batch
predicted_class_ids = logits.argmax(dim=1).tolist()

end_time = time.time()
elapsed_time = end_time - start_time

# Save the elapsed time in the list
elapsed_times.append(elapsed_time)

if(enable_profile):
prof.step()
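# prof.step() advances the profiler's wait/warmup/active schedule once per
# iteration; when profiling is disabled, profiler_setup presumably returns a
# contextlib null context (hence the contextlib import), so the call is skipped.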

# print(outputs)
# print(type(outputs))
# print("logits.shape is " + str(logits.shape))
# print(logits)

# print(predicted_class_ids)

elif engine == 'ov':
print("OV inference")
# Initialize an empty list to store elapsed times
elapsed_times = []

# Loop for batch processing 10 times and calculate the time for every loop
for i in range(10):
start_time = time.time()

outputs = model(**inputs)
logits = outputs.logits

# Get the predicted class for each input in the batch
predicted_class_ids = logits.argmax(dim=1).tolist()

end_time = time.time()
elapsed_time = end_time - start_time

# Save the elapsed time in the list
elapsed_times.append(elapsed_time)

# print(outputs)
# print(type(outputs))
# print("logits.shape is " + str(logits.shape))
# print(logits)

# print(predictions)
#print(predicted_class_ids)


# Skip the first two values and calculate the average of the remaining elapsed times
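# (The first calls typically pay one-time graph-capture / compilation cost,
# so they are treated as warmup and excluded from the average.)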
average_elapsed_time = sum(elapsed_times[2:]) / len(elapsed_times[2:])
classfication_per_second = batch_size/average_elapsed_time