Remove all ipex usage (#12666)
MeouSker77 authored Jan 8, 2025
1 parent 0534d72 commit ccf618f
Showing 9 changed files with 39 additions and 553 deletions.
python/llm/dev/benchmark/all-in-one/run-stress-test.py (9 changes: 3 additions & 6 deletions)
@@ -148,7 +148,7 @@ def run_transformer_int4_gpu(repo_id,
num_beams,
low_bit):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer
import intel_extension_for_pytorch as ipex
reserved_mem_list = []
model_path = get_model_path(repo_id, local_model_hub)
@@ -170,9 +170,6 @@ def run_transformer_int4_gpu(repo_id,
trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
if isinstance(model, GPTJForCausalLM):
# For gpt-j model family, this optimization can provide a better performance.
model = ipex.optimize(model.eval(), inplace=True)
end = time.perf_counter()
print(">> loading of model costs {}s".format(end - st))
reserved_mem_list.append(torch.xpu.memory.memory_reserved()/(1024**3))
@@ -227,7 +224,7 @@ def run_transformer_int4_gpu(repo_id,
today = date.today()
if 'exclude' in conf:
excludes = conf['exclude']

import pandas as pd
for api in conf.test_api:
for model in conf.repo_id:
@@ -240,7 +237,7 @@ def run_transformer_int4_gpu(repo_id,
run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
conf['low_bit'], conf['cpu_embedding'])
df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
'input/output tokens', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'input/output tokens', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'peak mem (GB)'])

df.to_csv(f'{current_dir}/{api}-results-{today}.csv')
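With the GPT-J special case gone, the stress-test benchmark no longer imports intel_extension_for_pytorch or calls ipex.optimize; loading reduces to ipex_llm's own low-bit path followed by a move to the XPU device. A minimal sketch of that flow, assuming a local model directory (model_path and low_bit are placeholders):

    # Minimal sketch of the simplified GPU load path; model_path and low_bit are placeholders.
    from ipex_llm.transformers import AutoModelForCausalLM
    from transformers import AutoTokenizer

    model_path = "/path/to/local/model"   # placeholder
    low_bit = "sym_int4"                  # placeholder

    model = AutoModelForCausalLM.from_pretrained(
        model_path, load_in_low_bit=low_bit, optimize_model=True,
        trust_remote_code=True, use_cache=True).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = model.to('xpu')   # no ipex.optimize() step, even for GPT-J family models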
python/llm/dev/benchmark/all-in-one/run.py (42 changes: 21 additions & 21 deletions)
@@ -138,8 +138,8 @@ def preprocess_prompt(tokenizer, in_len, task):
elif in_len == 4096:
input_str = open(f"prompt/QA/orca_497.txt", 'r', encoding='utf-8').read()
else:
raise ValueError("No corresponding prompt available now, will be added later.")
input_ids = tokenizer.encode(input_str, return_tensors="pt")
raise ValueError("No corresponding prompt available now, will be added later.")
input_ids = tokenizer.encode(input_str, return_tensors="pt")
return input_ids

def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False, batch_size=1, streaming=False, use_fp16_torch_dtype=False, lookahead=False, task='continuation', optimize_model=False, transpose_value_cache=True, group_size=64):
@@ -222,7 +222,7 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
streaming if 'win' in test_api else 'N/A',
use_fp16_torch_dtype if 'pipeline_parallel_gpu' in test_api else 'N/A',
group_size if any(keyword in test_api for keyword in ['transformers_int4_npu_win', 'transformers_int4_npu_pipeline_win']) else 'N/A'],
)
)


def get_model_path(repo_id, local_model_hub):
@@ -475,7 +475,7 @@ def run_transformer_int4_gpu(repo_id,
lookahead=False,
task='continuation'):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
@@ -490,7 +490,7 @@ def run_transformer_int4_gpu(repo_id,
model = AutoModel.load_low_bit(model_path, optimize_model=True,
trust_remote_code=True, use_cache=True,
cpu_embedding=cpu_embedding,
torch_dtype=torch_dtype).eval()
torch_dtype=torch_dtype).eval()
else:
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
trust_remote_code=True, use_cache=True,
@@ -507,7 +507,7 @@ def run_transformer_int4_gpu(repo_id,
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
trust_remote_code=True, use_cache=True,
trust_remote_code=True, use_cache=True,
cpu_embedding=cpu_embedding, torch_dtype=torch_dtype).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
@@ -632,14 +632,14 @@ def transformers_int4_npu_win(repo_id,
st = time.perf_counter()
if repo_id in MINICPM_V_IDS:
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=optimize_model,
trust_remote_code=True, use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
trust_remote_code=True, use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
save_directory=save_directory, attn_implementation="eager", torch_dtype=torch.float16).eval()
model = model.llm
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
else:
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
save_directory=save_directory, use_cache=True, attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -707,7 +707,7 @@ def transformers_int4_npu_pipeline_win(repo_id,
st = time.perf_counter()

model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, pipeline=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager",
save_directory=save_directory).eval()
@@ -843,7 +843,7 @@ def run_transformers_openvino(repo_id,

ov_config = {"PERFORMANCE_HINT": "LATENCY",
"NUM_STREAMS": "1", "CACHE_DIR": ""}
config_dict = dict(pretrained_model_name_or_path=model_path,
config_dict = dict(pretrained_model_name_or_path=model_path,
trust_remote_code=True,
use_cache=True, low_cpu_mem_usage=True)

@@ -906,7 +906,7 @@ def run_optimize_model_gpu(repo_id,
num_beams,
low_bit,
batch_size):
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
from ipex_llm import optimize_model
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
@@ -986,7 +986,7 @@ def run_ipex_fp16_gpu(repo_id,
num_beams,
batch_size):
from transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
st = time.perf_counter()
if repo_id in CHATGLM_IDS:
@@ -1051,7 +1051,7 @@ def run_bigdl_fp16_gpu(repo_id,
num_beams,
batch_size):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
st = time.perf_counter()
if repo_id in CHATGLM_IDS:
@@ -1209,7 +1209,7 @@ def run_transformer_int4_gpu_win(repo_id,
batch_size,
streaming):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
from transformers import AutoTokenizer, LlamaTokenizer, TextStreamer
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
@@ -1338,7 +1338,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
batch_size,
streaming):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
from transformers import AutoTokenizer, LlamaTokenizer, TextStreamer
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
@@ -1475,7 +1475,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
batch_size,
streaming):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
from transformers import AutoTokenizer, LlamaTokenizer, TextStreamer
model_path = get_model_path(repo_id, local_model_hub)
# Load BigDL-LLM optimized low bit model
st = time.perf_counter()
@@ -1585,7 +1585,7 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
batch_size,
streaming):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
from transformers import AutoTokenizer, LlamaTokenizer, TextStreamer
model_path = get_model_path(repo_id, local_model_hub)
# Load BigDL-LLM optimized low bit model
st = time.perf_counter()
@@ -1972,7 +1972,7 @@ def get_int_from_env(env_keys, default):
os.environ["WORLD_SIZE"] = str(world_size)
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500")

from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
from ipex_llm import optimize_model
import deepspeed
from deepspeed.accelerator.cpu_accelerator import CPU_Accelerator
@@ -2013,7 +2013,7 @@ def get_int_from_env(env_keys, default):
# Move model back to xpu
model = model.to(f'xpu:{local_rank}')

# Modify backend related settings
# Modify backend related settings
if world_size > 1:
get_accelerator().set_device(local_rank)
dist_backend = get_accelerator().communication_backend_name()
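For context, the surrounding DeepSpeed AutoTP code (unchanged here apart from the import cleanup) places each rank's low-bit model on its own XPU device and derives the collective backend from DeepSpeed's accelerator API. A rough, hedged sketch of that wiring; the init_distributed call and the stand-in model are assumptions for illustration, not lifted from the benchmark:

    # Hedged sketch of the XPU placement / backend selection pattern shown above.
    import os
    import torch
    import deepspeed
    from deepspeed.accelerator import get_accelerator

    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))

    model = torch.nn.Linear(8, 8)            # stand-in for the low-bit converted model
    model = model.to(f'xpu:{local_rank}')    # move this rank's copy onto its XPU device

    if world_size > 1:
        get_accelerator().set_device(local_rank)
        dist_backend = get_accelerator().communication_backend_name()
        # Assumed initialization call; the benchmark's own distributed setup may differ.
        deepspeed.init_distributed(dist_backend=dist_backend)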
@@ -2215,7 +2215,7 @@ def run_pipeline_parallel_gpu(repo_id,
cpu_embedding,
fp16=False):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM, init_pipeline_parallel
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer
init_pipeline_parallel()
model_path = get_model_path(repo_id, local_model_hub)
pipeline_parallel_stages = torch.distributed.get_world_size()
@@ -2311,7 +2311,7 @@ def run_pipeline_parallel_gpu(repo_id,
transpose_value_cache = True
if 'transpose_value_cache' in conf:
transpose_value_cache = conf['transpose_value_cache']

import pandas as pd
for api in conf.test_api:
global csv_name
python/llm/src/ipex_llm/transformers/convert.py (28 changes: 2 additions & 26 deletions)
@@ -680,18 +680,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
optimize_lm_head=optimize_lm_head
)
device = module.weight.data.device
from ipex_llm.transformers.utils import get_ipex_version
if get_ipex_version() < "2.1.10+xpu":
new_linear._parameters['weight'] = nn.Parameter(module.weight)
else:
# only from 2.1, ipex provides matmul_bias_out
# so we need to transpose weight
new_weight = module.weight.transpose(0, 1).contiguous()
new_linear._parameters['weight'] = nn.Parameter(new_weight)
new_linear.weight_type = 2
new_linear._parameters['weight'] = nn.Parameter(module.weight)
if module.bias is not None:
new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
.to(device)
new_linear._parameters['bias'] = nn.Parameter(module.bias.data).to(device)
elif qtype == ggml_tensor_qtype["bf16"]:
module.to(torch.bfloat16)
if _USE_VLLM:
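The fp16 branch above now stores the weight exactly as nn.Linear does, in (out_features, in_features) layout, and leaves weight_type at 1. That works because F.linear already expects that layout; the transpose was only ever needed for the ipex matmul_bias_out kernel that is no longer used. A small self-contained check of that layout assumption:

    # F.linear(x, W, b) computes x @ W.T + b with W of shape (out_features, in_features),
    # i.e. the layout nn.Linear already stores, so no transpose bookkeeping is required.
    import torch
    import torch.nn.functional as F

    linear = torch.nn.Linear(4, 3, bias=True)
    x = torch.randn(2, 4)

    out = F.linear(x, linear.weight, linear.bias)
    assert out.shape == (2, 3)
    assert torch.allclose(out, linear(x))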
@@ -1452,21 +1443,6 @@ def _optimize_post(model):
module.MultiheadAttention,
mpt_multihead_attention_forward
)
elif "gptj" in model.config.model_type:
# dolly-v1-6b
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
from ipex_llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\
gptj_block_forward
convert_forward(model,
module.GPTJAttention,
gptj_attention_forward)
convert_forward(model,
module.GPTJModel,
gptj_model_forward)
convert_forward(model,
module.GPTJBlock,
gptj_block_forward)
elif "bloom" in model.config.model_type:
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
python/llm/src/ipex_llm/transformers/loader.py (2 changes: 1 addition & 1 deletion)
@@ -22,7 +22,7 @@
from datetime import date
import argparse
from ipex_llm.utils.common import invalidInputError
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer

LLAMA_IDS = ['llama', 'vicuna', 'merged-baize']

python/llm/src/ipex_llm/transformers/low_bit_linear.py (30 changes: 8 additions & 22 deletions)
@@ -759,9 +759,9 @@ def __init__(self, input_features, output_features, bias=True,
self.weight_length = self.out_len * self.in_len
self.qtype = ggml_tensor_qtype["fp16"]
self.mp_group = mp_group
# weigh_type = 1 means original weight
# weigh_type = 2 means weight has been transposed
# weigh_type = 3 means weight has been transposed by esimd method
# weight_type = 1 means original weight
# weight_type = 2 means weight has been transposed
# weight_type = 3 means weight has been transposed by esimd method
self.weight_type = 1
self.optimize_lm_head = optimize_lm_head
self.disable_fp16_opt = False
@@ -775,28 +775,14 @@ def forward(self, x: torch.Tensor):

x = x.to(torch.float16)
if self.bias is not None and self.bias.dtype != x.dtype:
self.bias.data = self.bias.data.to(x.dtype)
self.bias.data = self.bias.data.to(x.dtype)
if self.weight is not None and self.weight.dtype != x.dtype:
self.weight.data = self.weight.data.to(x.dtype)

if not self.use_esimd_kernel(x):
if (
get_ipex_version() < "2.1.10+xpu"
or get_xpu_device_name(x.device) not in ["arc", "pvc"]
or self.disable_fp16_opt
):
if self.weight_type == 2:
self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
requires_grad=False)
self.weight_type = 1
result = F.linear(x, self.weight, self.bias)
else:
if self.weight_type == 1:
self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
requires_grad=False)
self.weight_type = 2
result = torch.ops.torch_ipex.matmul_bias_out(x.contiguous(),
self.weight, self.bias)
invalidInputError(self.weight_type == 1, "weight_type should be 1")
result = F.linear(x, self.weight, self.bias)

if self.mp_group is not None:
if get_use_vllm():
result = self.mp_group.all_reduce(result)
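With the ipex-version and device branching removed, the non-ESIMD path of FP16Linear.forward is just a dtype cast plus F.linear on the untransposed weight. A hedged, condensed sketch of what that path now amounts to (the tensor-parallel all-reduce and the ESIMD branch of the real class are omitted):

    # Condensed sketch of the simplified non-ESIMD forward path (illustrative only).
    import torch
    import torch.nn.functional as F

    class SimplifiedFP16Linear(torch.nn.Linear):
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            x = x.to(torch.float16)
            if self.bias is not None and self.bias.dtype != x.dtype:
                self.bias.data = self.bias.data.to(x.dtype)
            if self.weight.dtype != x.dtype:
                self.weight.data = self.weight.data.to(x.dtype)
            # weight stays in its original (out_features, in_features) layout
            # (weight_type == 1), so a plain F.linear call is sufficient
            return F.linear(x, self.weight, self.bias)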
@@ -852,7 +838,7 @@ def use_esimd_kernel(self, x):
if self.disable_fp16_opt:
return False
# esimd kernel can only be used for Arc and Flex
if gpu_type not in ["arc", "flex"]:
if gpu_type not in ["arc"]:
return False
# now esimd kernel can only be used for specific cases (llama2-7b shape)
if self.in_len == 11008 and self.out_features == 4096:
python/llm/src/ipex_llm/transformers/model.py (7 changes: 0 additions & 7 deletions)
@@ -103,12 +103,6 @@ def save_low_bit(self, *args, **kwargs):
self.to(origin_device)


def _load_pre():
from transformers import GPTJModel
from ipex_llm.transformers.models.gptj import gptj_model_new_init
GPTJModel.__init__ = gptj_model_new_init


class _BaseAutoModelClass:
HF_MODEL = None

@@ -495,7 +489,6 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs):
else:
if quant_config is not None:
kwargs["quantization_config"] = quant_config
_load_pre()
try:
# To handle the input CUDA setting (such as 'device_map={"":0}'), ignore it
kwargs.pop('device_map', None)
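Removing _load_pre leaves a single, architecture-agnostic low-bit flow: convert once with from_pretrained, persist with save_low_bit, and reload with load_low_bit, with no per-model __init__ patching. A hedged usage sketch with placeholder paths and low-bit format:

    # Hedged usage sketch of the generic low-bit save/reload round trip.
    # Paths and the low-bit format are placeholders.
    from ipex_llm.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "/path/to/hf/model", load_in_low_bit="sym_int4",
        trust_remote_code=True, use_cache=True)
    model.save_low_bit("/path/to/low-bit-checkpoint")

    reloaded = AutoModelForCausalLM.load_low_bit(
        "/path/to/low-bit-checkpoint", trust_remote_code=True, use_cache=True)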