Remove all ipex usage #12666

Merged
3 commits merged on Jan 8, 2025
9 changes: 3 additions & 6 deletions python/llm/dev/benchmark/all-in-one/run-stress-test.py
@@ -148,7 +148,7 @@ def run_transformer_int4_gpu(repo_id,
num_beams,
low_bit):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer
import intel_extension_for_pytorch as ipex
reserved_mem_list = []
model_path = get_model_path(repo_id, local_model_hub)
@@ -170,9 +170,6 @@ def run_transformer_int4_gpu(repo_id,
trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
if isinstance(model, GPTJForCausalLM):
# For gpt-j model family, this optimization can provide a better performance.
model = ipex.optimize(model.eval(), inplace=True)
end = time.perf_counter()
print(">> loading of model costs {}s".format(end - st))
reserved_mem_list.append(torch.xpu.memory.memory_reserved()/(1024**3))
@@ -227,7 +224,7 @@ def run_transformer_int4_gpu(repo_id,
today = date.today()
if 'exclude' in conf:
excludes = conf['exclude']

import pandas as pd
for api in conf.test_api:
for model in conf.repo_id:
@@ -240,7 +237,7 @@ def run_transformer_int4_gpu(repo_id,
run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
conf['low_bit'], conf['cpu_embedding'])
df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
'input/output tokens', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'input/output tokens', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'peak mem (GB)'])

df.to_csv(f'{current_dir}/{api}-results-{today}.csv')
42 changes: 21 additions & 21 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -138,8 +138,8 @@ def preprocess_prompt(tokenizer, in_len, task):
elif in_len == 4096:
input_str = open(f"prompt/QA/orca_497.txt", 'r', encoding='utf-8').read()
else:
raise ValueError("No corresponding prompt available now, will be added later.")
input_ids = tokenizer.encode(input_str, return_tensors="pt")
raise ValueError("No corresponding prompt available now, will be added later.")
input_ids = tokenizer.encode(input_str, return_tensors="pt")
return input_ids

def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False, batch_size=1, streaming=False, use_fp16_torch_dtype=False, lookahead=False, task='continuation', optimize_model=False, transpose_value_cache=True, group_size=64):
@@ -222,7 +222,7 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
streaming if 'win' in test_api else 'N/A',
use_fp16_torch_dtype if 'pipeline_parallel_gpu' in test_api else 'N/A',
group_size if any(keyword in test_api for keyword in ['transformers_int4_npu_win', 'transformers_int4_npu_pipeline_win']) else 'N/A'],
)
)


def get_model_path(repo_id, local_model_hub):
@@ -475,7 +475,7 @@ def run_transformer_int4_gpu(repo_id,
lookahead=False,
task='continuation'):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
@@ -490,7 +490,7 @@ def run_transformer_int4_gpu(repo_id,
model = AutoModel.load_low_bit(model_path, optimize_model=True,
trust_remote_code=True, use_cache=True,
cpu_embedding=cpu_embedding,
torch_dtype=torch_dtype).eval()
torch_dtype=torch_dtype).eval()
else:
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
trust_remote_code=True, use_cache=True,
@@ -507,7 +507,7 @@ def run_transformer_int4_gpu(repo_id,
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
trust_remote_code=True, use_cache=True,
trust_remote_code=True, use_cache=True,
cpu_embedding=cpu_embedding, torch_dtype=torch_dtype).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
@@ -632,14 +632,14 @@ def transformers_int4_npu_win(repo_id,
st = time.perf_counter()
if repo_id in MINICPM_V_IDS:
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=optimize_model,
trust_remote_code=True, use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
trust_remote_code=True, use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
save_directory=save_directory, attn_implementation="eager", torch_dtype=torch.float16).eval()
model = model.llm
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
else:
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
save_directory=save_directory, use_cache=True, attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -707,7 +707,7 @@ def transformers_int4_npu_pipeline_win(repo_id,
st = time.perf_counter()

model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, pipeline=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager",
save_directory=save_directory).eval()
@@ -843,7 +843,7 @@ def run_transformers_openvino(repo_id,

ov_config = {"PERFORMANCE_HINT": "LATENCY",
"NUM_STREAMS": "1", "CACHE_DIR": ""}
config_dict = dict(pretrained_model_name_or_path=model_path,
config_dict = dict(pretrained_model_name_or_path=model_path,
trust_remote_code=True,
use_cache=True, low_cpu_mem_usage=True)

@@ -906,7 +906,7 @@ def run_optimize_model_gpu(repo_id,
num_beams,
low_bit,
batch_size):
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
from ipex_llm import optimize_model
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
@@ -986,7 +986,7 @@ def run_ipex_fp16_gpu(repo_id,
num_beams,
batch_size):
from transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
st = time.perf_counter()
if repo_id in CHATGLM_IDS:
@@ -1051,7 +1051,7 @@ def run_bigdl_fp16_gpu(repo_id,
num_beams,
batch_size):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
st = time.perf_counter()
if repo_id in CHATGLM_IDS:
@@ -1209,7 +1209,7 @@ def run_transformer_int4_gpu_win(repo_id,
batch_size,
streaming):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
from transformers import AutoTokenizer, LlamaTokenizer, TextStreamer
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
@@ -1338,7 +1338,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
batch_size,
streaming):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
from transformers import AutoTokenizer, LlamaTokenizer, TextStreamer
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
@@ -1475,7 +1475,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
batch_size,
streaming):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
from transformers import AutoTokenizer, LlamaTokenizer, TextStreamer
model_path = get_model_path(repo_id, local_model_hub)
# Load BigDL-LLM optimized low bit model
st = time.perf_counter()
@@ -1585,7 +1585,7 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
batch_size,
streaming):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer
from transformers import AutoTokenizer, LlamaTokenizer, TextStreamer
model_path = get_model_path(repo_id, local_model_hub)
# Load BigDL-LLM optimized low bit model
st = time.perf_counter()
@@ -1972,7 +1972,7 @@ def get_int_from_env(env_keys, default):
os.environ["WORLD_SIZE"] = str(world_size)
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500")

from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
from ipex_llm import optimize_model
import deepspeed
from deepspeed.accelerator.cpu_accelerator import CPU_Accelerator
@@ -2013,7 +2013,7 @@ def get_int_from_env(env_keys, default):
# Move model back to xpu
model = model.to(f'xpu:{local_rank}')

# Modify backend related settings
# Modify backend related settings
if world_size > 1:
get_accelerator().set_device(local_rank)
dist_backend = get_accelerator().communication_backend_name()
@@ -2215,7 +2215,7 @@ def run_pipeline_parallel_gpu(repo_id,
cpu_embedding,
fp16=False):
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM, init_pipeline_parallel
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer
init_pipeline_parallel()
model_path = get_model_path(repo_id, local_model_hub)
pipeline_parallel_stages = torch.distributed.get_world_size()
@@ -2311,7 +2311,7 @@ def run_pipeline_parallel_gpu(repo_id,
transpose_value_cache = True
if 'transpose_value_cache' in conf:
transpose_value_cache = conf['transpose_value_cache']

import pandas as pd
for api in conf.test_api:
global csv_name
28 changes: 2 additions & 26 deletions python/llm/src/ipex_llm/transformers/convert.py
@@ -680,18 +680,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
optimize_lm_head=optimize_lm_head
)
device = module.weight.data.device
from ipex_llm.transformers.utils import get_ipex_version
if get_ipex_version() < "2.1.10+xpu":
new_linear._parameters['weight'] = nn.Parameter(module.weight)
else:
# only from 2.1, ipex provides matmul_bias_out
# so we need to transpose weight
new_weight = module.weight.transpose(0, 1).contiguous()
new_linear._parameters['weight'] = nn.Parameter(new_weight)
new_linear.weight_type = 2
new_linear._parameters['weight'] = nn.Parameter(module.weight)
if module.bias is not None:
new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
.to(device)
new_linear._parameters['bias'] = nn.Parameter(module.bias.data).to(device)
elif qtype == ggml_tensor_qtype["bf16"]:
module.to(torch.bfloat16)
if _USE_VLLM:
@@ -1452,21 +1443,6 @@ def _optimize_post(model):
module.MultiheadAttention,
mpt_multihead_attention_forward
)
elif "gptj" in model.config.model_type:
# dolly-v1-6b
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
from ipex_llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\
gptj_block_forward
convert_forward(model,
module.GPTJAttention,
gptj_attention_forward)
convert_forward(model,
module.GPTJModel,
gptj_model_forward)
convert_forward(model,
module.GPTJBlock,
gptj_block_forward)
elif "bloom" in model.config.model_type:
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
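To consolidate the fp16 hand-off shown in the convert.py hunk above: with the ipex matmul_bias_out path gone, the converted linear now always keeps the weight in its original layout (weight_type == 1) instead of transposing it based on the ipex version. A minimal sketch of the simplified logic, with the surrounding FP16Linear machinery omitted (the helper name below is illustrative, not part of ipex-llm):

import torch.nn as nn

# Illustrative sketch only (not the actual ipex-llm code): the fp16 branch of
# _replace_with_low_bit_linear now copies the weight as-is, with no ipex
# version check and no transpose, since matmul_bias_out is no longer called.
def copy_fp16_params(module: nn.Linear, new_linear: nn.Module) -> None:
    device = module.weight.data.device
    # Weight stays in its original (out_features, in_features) layout,
    # i.e. weight_type == 1.
    new_linear._parameters['weight'] = nn.Parameter(module.weight)
    if module.bias is not None:
        new_linear._parameters['bias'] = nn.Parameter(module.bias.data).to(device)

In the real code, new_linear is the ipex-llm FP16Linear instance constructed just above this hunk; only the weight and bias assignments are sketched here.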
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/transformers/loader.py
@@ -22,7 +22,7 @@
from datetime import date
import argparse
from ipex_llm.utils.common import invalidInputError
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, LlamaTokenizer

LLAMA_IDS = ['llama', 'vicuna', 'merged-baize']

30 changes: 8 additions & 22 deletions python/llm/src/ipex_llm/transformers/low_bit_linear.py
@@ -759,9 +759,9 @@ def __init__(self, input_features, output_features, bias=True,
self.weight_length = self.out_len * self.in_len
self.qtype = ggml_tensor_qtype["fp16"]
self.mp_group = mp_group
# weigh_type = 1 means original weight
# weigh_type = 2 means weight has been transposed
# weigh_type = 3 means weight has been transposed by esimd method
# weight_type = 1 means original weight
# weight_type = 2 means weight has been transposed
# weight_type = 3 means weight has been transposed by esimd method
self.weight_type = 1
self.optimize_lm_head = optimize_lm_head
self.disable_fp16_opt = False
@@ -775,28 +775,14 @@ def forward(self, x: torch.Tensor):

x = x.to(torch.float16)
if self.bias is not None and self.bias.dtype != x.dtype:
self.bias.data = self.bias.data.to(x.dtype)
self.bias.data = self.bias.data.to(x.dtype)
if self.weight is not None and self.weight.dtype != x.dtype:
self.weight.data = self.weight.data.to(x.dtype)

if not self.use_esimd_kernel(x):
if (
get_ipex_version() < "2.1.10+xpu"
or get_xpu_device_name(x.device) not in ["arc", "pvc"]
or self.disable_fp16_opt
):
if self.weight_type == 2:
self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
requires_grad=False)
self.weight_type = 1
result = F.linear(x, self.weight, self.bias)
else:
if self.weight_type == 1:
self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
requires_grad=False)
self.weight_type = 2
result = torch.ops.torch_ipex.matmul_bias_out(x.contiguous(),
self.weight, self.bias)
invalidInputError(self.weight_type == 1, "weight_type should be 1")
result = F.linear(x, self.weight, self.bias)

if self.mp_group is not None:
if get_use_vllm():
result = self.mp_group.all_reduce(result)
@@ -852,7 +838,7 @@ def use_esimd_kernel(self, x):
if self.disable_fp16_opt:
return False
# esimd kernel can only be used for Arc and Flex
if gpu_type not in ["arc", "flex"]:
if gpu_type not in ["arc"]:
Contributor: why remove flex here?

Contributor Author: Flex is now recognized as "arc"; no need to handle it any more.

(A brief illustrative sketch of the simplified gate follows this file's diff.)
return False
# now esimd kernel can only be used for specific cases (llama2-7b shape)
if self.in_len == 11008 and self.out_features == 4096:
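Regarding the review exchange above: since Flex parts now report as "arc", the esimd gate in use_esimd_kernel only needs a single device-name check. A minimal sketch of that gate under this assumption (the helper below is illustrative; the real method also checks qtype and the specific llama2-7b shape, as shown in the truncated hunk):

# Illustrative sketch only. Assumption (from the review reply): the device-name
# helper now reports Flex GPUs as "arc", so checking for "arc" also covers Flex.
def esimd_device_supported(gpu_type: str, disable_fp16_opt: bool) -> bool:
    if disable_fp16_opt:
        return False
    # "flex" was dropped from the allow-list because Flex is recognized as "arc".
    return gpu_type in ["arc"]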
7 changes: 0 additions & 7 deletions python/llm/src/ipex_llm/transformers/model.py
@@ -103,12 +103,6 @@ def save_low_bit(self, *args, **kwargs):
self.to(origin_device)


def _load_pre():
from transformers import GPTJModel
from ipex_llm.transformers.models.gptj import gptj_model_new_init
GPTJModel.__init__ = gptj_model_new_init


class _BaseAutoModelClass:
HF_MODEL = None

@@ -495,7 +489,6 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs):
else:
if quant_config is not None:
kwargs["quantization_config"] = quant_config
_load_pre()
try:
# To handle the input CUDA setting (such as 'device_map={"":0}'), ignore it
kwargs.pop('device_map', None)