From 0b953e61ef4600bdd9d80ddd784140cc883e73a7 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 16 Dec 2024 09:17:01 +0800 Subject: [PATCH] [REFINE] graphmode code (#12540) --- .../convert-model-textgen-to-classfication.py | 6 +- .../GraphMode/gpt2-graph-mode-benchmark.py | 84 +------------------ 2 files changed, 3 insertions(+), 87 deletions(-) diff --git a/python/llm/example/GPU/GraphMode/convert-model-textgen-to-classfication.py b/python/llm/example/GPU/GraphMode/convert-model-textgen-to-classfication.py index d94364dacb2..6cf5720fb2e 100644 --- a/python/llm/example/GPU/GraphMode/convert-model-textgen-to-classfication.py +++ b/python/llm/example/GPU/GraphMode/convert-model-textgen-to-classfication.py @@ -25,15 +25,11 @@ dtype=torch.bfloat16 num_labels = 5 - model_name=model_path - save_directory = model_name + "-classification" -# Initialize the tokenizer -# Need padding from the left and padding to 1024 +# Initialize the tokenizer tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) -# tokenizer.padding_side = "left" tokenizer.pad_token = tokenizer.eos_token tokenizer.save_pretrained(save_directory) diff --git a/python/llm/example/GPU/GraphMode/gpt2-graph-mode-benchmark.py b/python/llm/example/GPU/GraphMode/gpt2-graph-mode-benchmark.py index dd903bea8b6..697b64b1c44 100644 --- a/python/llm/example/GPU/GraphMode/gpt2-graph-mode-benchmark.py +++ b/python/llm/example/GPU/GraphMode/gpt2-graph-mode-benchmark.py @@ -17,6 +17,7 @@ import torch import time import argparse +import contextlib from transformers import GPT2ForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, Qwen2ForSequenceClassification from torch.profiler import profile, record_function, ProfilerActivity, schedule @@ -36,12 +37,6 @@ model_path = args.model_path print(f"The batch size is: {batch_size}, device is {device}") - -###################################################################################### -# PyTorch Profiling with IPEX -# export IPEX_ZE_TRACING=1 -# export ZE_ENABLE_TRACING_LAYER=1 -import contextlib def profiler_setup(profiling=False, *args, **kwargs): if profiling: return torch.profiler.profile(*args, **kwargs) @@ -55,21 +50,15 @@ def profiler_setup(profiling=False, *args, **kwargs): active=1 ) -# also define a handler for outputing results +# define a handler for outputing results def trace_handler(p): if(device == 'xpu'): print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=20)) print(p.key_averages().table(sort_by="cpu_time_total", row_limit=20)) - # p.export_chrome_trace("./trace_" + str(p.step_num) + ".json") -####################################################################################### - - dtype = torch.bfloat16 if device == 'cpu' else torch.float16 num_labels = 5 - model_name = model_path - model_name = model_name + "-classification" model_name_ov = model_name + "-ov" model_name_ov = model_name_ov + "-fp16" @@ -77,11 +66,9 @@ def trace_handler(p): if (engine == 'ipex') : import torch import intel_extension_for_pytorch as ipex - # Need padding from the left and padding to 1024 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer.padding_side = "left" tokenizer.pad_token = tokenizer.eos_token - model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype, pad_token_id=tokenizer.eos_token_id, low_cpu_mem_usage=True @@ -106,122 +93,55 @@ def trace_handler(p): tokenizer.pad_token = tokenizer.eos_token model = OVModelForSequenceClassification.from_pretrained(model_name_ov, torch_dtype=dtype).to(device) - - # Intel(R) Extension for PyTorch* if engine == 'ipex': if device == 'cpu': - # model = ipex.llm.optimize(model, dtype=dtype, inplace=True, deployment_mode=True) - # ############## TorchDynamo ############### model = ipex.optimize(model, dtype=torch.bfloat16, weights_prepack=False) model = torch.compile(model, backend='ipex') - # ########################################## else: # Intel XPU - #model = ipex.llm.optimize(model, dtype=dtype, device="xpu", inplace=True) model = ipex.optimize(model, dtype=dtype, inplace=True) - model=torch.compile(model, backend="inductor") print(model) - - # # #######calulate the total num of parameters######## - # def model_size(model): - # return sum(t.numel() for t in model.parameters()) - # print(f"GPT2 size: {model_size(model)/1000**2:.1f}M parameters") - # # # #######print model information ################### - # print(model) - - # ########Enable the BetterTransformer ################### - # only Better Transformer only support GPT2, not support Qwen2 - # model = BetterTransformer.transform(model) -#elif engine == 'ipex-llm': -# model = ipex.optimize(model, dtype=dtype, inplace=True) -# model=torch.compile(model) #backend="inductor") elif engine == 'ov': print("OV inference") prompt = ["this is the first prompt"] prompts = prompt * batch_size -#print(prompts) - -# Tokenize the batch of prompts inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=1024, truncation=True) -# print(inputs) if engine == 'ipex' or engine == 'ipex-llm': - #ipex need move the inputs to device, but OV doesn't need inputs.to(device) - - # Initialize an empty list to store elapsed times elapsed_times = [] - - # Loop for batch processing 10 times and calculate the time for every loop with profiler_setup(profiling=enable_profile, activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], schedule=my_schedule, on_trace_ready=trace_handler, - # on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/gpt2'), record_shapes=True, with_stack=True ) as prof: - for i in range(10): start_time = time.time() - - # Perform inference with torch.inference_mode(): - # logits = model(**inputs).logits outputs = model(**inputs) logits = outputs.logits - - # Get the predicted class for each input in the batch predicted_class_ids = logits.argmax(dim=1).tolist() - end_time = time.time() elapsed_time = end_time - start_time - - # Save the elapsed time in the list elapsed_times.append(elapsed_time) - if(enable_profile): prof.step() - - # print(outputs) - # print(type(outputs)) - # print("logits.shape is " + str(logits.shape)) - # print(logits) - - # print(predicted_class_ids) - elif engine == 'ov': print("OV inference") - # Initialize an empty list to store elapsed times elapsed_times = [] - - # Loop for batch processing 10 times and calculate the time for every loop for i in range(10): start_time = time.time() - outputs = model(**inputs) logits = outputs.logits - - # Get the predicted class for each input in the batch predicted_class_ids = logits.argmax(dim=1).tolist() - end_time = time.time() elapsed_time = end_time - start_time - - # Save the elapsed time in the list elapsed_times.append(elapsed_time) - # print(outputs) - # print(type(outputs)) - # print("logits.shape is " + str(logits.shape)) - # print(logits) - - # print(predictions) - #print(predicted_class_ids) - - # Skip the first two values and calculate the average of the remaining elapsed times average_elapsed_time = sum(elapsed_times[2:]) / len(elapsed_times[2:]) classfication_per_second = batch_size/average_elapsed_time