From 74d01dc6a30b7388d8a9668a124fd46614fc3d9c Mon Sep 17 00:00:00 2001 From: plusbang Date: Tue, 10 Dec 2024 17:18:49 +0800 Subject: [PATCH 1/5] avoid compile error --- python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py index 69f618c888a..fd03d0426ab 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py @@ -230,10 +230,9 @@ def build_decoder( ): residual = hidden_states - input_2d = self.reshape(hidden_states, (self.batch_size * self.seq_len, self.hidden_size)) - input_2d = self.layer_norm(input_2d, input_layernorm_weight) + hidden_states = self.layer_norm(hidden_states, input_layernorm_weight) attn_output, new_key_states, new_value_states = self.attention( - hidden_states=input_2d, + hidden_states=hidden_states, position_ids=position_ids, attention_mask=attention_mask, past_key=past_key, From e6bf29538e4efe8f5d0191bcc2eb40df683e7897 Mon Sep 17 00:00:00 2001 From: plusbang Date: Wed, 11 Dec 2024 15:04:13 +0800 Subject: [PATCH 2/5] debug --- .../src/ipex_llm/transformers/npu_model.py | 46 ++++----- .../ipex_llm/transformers/npu_models/llama.py | 8 +- .../transformers/npu_models/llama_mp.py | 99 +++++++++++++++---- 3 files changed, 112 insertions(+), 41 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 673a56ecfaa..b3ce91cf0ab 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -301,29 +301,29 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - if (not hasattr(model, 'llm') and - model.config.model_type in ["qwen2", "llama", "minicpm"]): - from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process - optimize_llm_single_process( - llm, - kv_len=max_context_len, - max_prompt_len=max_prompt_len, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size, - qtype=qtype, - save_directory=save_directory, - fuse_layers=fuse_layers - ) - else: - optimize_llm( - llm, - max_context_len=max_context_len, - max_prompt_len=max_prompt_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size - ) + # if (not hasattr(model, 'llm') and + # model.config.model_type in ["qwen2", "llama", "minicpm"]): + # from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process + # optimize_llm_single_process( + # llm, + # kv_len=max_context_len, + # max_prompt_len=max_prompt_len, + # transpose_value_cache=transpose_value_cache, + # group_size=quantization_group_size, + # qtype=qtype, + # save_directory=save_directory, + # fuse_layers=fuse_layers + # ) + # else: + optimize_llm( + llm, + max_context_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size + ) else: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm diff --git a/python/llm/src/ipex_llm/transformers/npu_models/llama.py b/python/llm/src/ipex_llm/transformers/npu_models/llama.py index ab4c2025a25..4447311cdc6 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/llama.py +++ 
b/python/llm/src/ipex_llm/transformers/npu_models/llama.py @@ -128,7 +128,7 @@ def llama_model_forward( all_self_attns = () if output_attentions else None next_decoder_cache = None - for decoder_layer in self.layers: + for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: all_hidden_states += (hidden_states,) @@ -144,6 +144,9 @@ def llama_model_forward( cache_position, ) else: + print(f'Before running {idx} decoder layer, hidden_states is {hidden_states}') + print(f'Before running {idx} decoder layer, causal_mask is {causal_mask}') + print(f'Before running {idx} decoder layer, position_ids is {position_ids}') layer_outputs = decoder_layer( hidden_states, attention_mask=causal_mask, @@ -155,6 +158,8 @@ def llama_model_forward( ) hidden_states = layer_outputs[0] + print(f'After running {idx} decoder layer, hidden_states is {hidden_states}') + print('===============') if use_cache: next_decoder_cache = layer_outputs[2 if output_attentions else 1] @@ -162,6 +167,7 @@ def llama_model_forward( if output_attentions: all_self_attns += (layer_outputs[1],) + print(f'run_prefill result, hidden_states is {hidden_states}') hidden_states = self.norm(hidden_states) # add hidden states from the last decoder layer diff --git a/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py index fd03d0426ab..b8de1b11be8 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py @@ -188,18 +188,32 @@ def __init__( curr_key_values = [] cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids) for i in range(num_layers): - hidden_states, new_key_states, new_value_states = self.build_decoder( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids if cos_condition else None, - input_layernorm_weight=input_layernorm_weights[i], - post_attention_layernorm_weight=post_attn_layernorm_weights[i], - past_key=past_keys[i], - past_value=past_values[i], - use_prefill_sdp=use_prefill_sdp, - cos=self.cos, - sin=self.sin, - ) + if self.mode == "prefill": + hidden_states, new_key_states, new_value_states, temp = self.build_decoder( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids if cos_condition else None, + input_layernorm_weight=input_layernorm_weights[i], + post_attention_layernorm_weight=post_attn_layernorm_weights[i], + past_key=past_keys[i], + past_value=past_values[i], + use_prefill_sdp=use_prefill_sdp, + cos=self.cos, + sin=self.sin, + ) + else: + hidden_states, new_key_states, new_value_states = self.build_decoder( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids if cos_condition else None, + input_layernorm_weight=input_layernorm_weights[i], + post_attention_layernorm_weight=post_attn_layernorm_weights[i], + past_key=past_keys[i], + past_value=past_values[i], + use_prefill_sdp=use_prefill_sdp, + cos=self.cos, + sin=self.sin, + ) curr_key_values.append((new_key_states, new_value_states)) # define outputs @@ -209,6 +223,9 @@ def __init__( new_key_states = self.convert_to_fp16(curr_key_values[i][0]) new_value_states = self.convert_to_fp16(curr_key_values[i][1]) + if self.mode == "prefill": + temp = self.convert_to_fp16(temp) + print("start compiling") if mode == "prefill" and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1": self.compile(npu_dpu_groups=6) @@ -230,9 +247,42 @@ def build_decoder( ): 
residual = hidden_states - hidden_states = self.layer_norm(hidden_states, input_layernorm_weight) + # if self.mode == "prefill": + # temp = hidden_states # first exec is okay + # hidden_states = self.layer_norm(hidden_states, input_layernorm_weight) + # # if self.mode == "prefill": + # # temp = hidden_states # first exec is wrong + if self.mode == "prefill": + # temp = hidden_states # first exec is okay + hidden_states = self.convert_to_fp32(hidden_states) + variance = self.reduce_mean( + self.power(hidden_states, self.constant(np.array([[2]], dtype=np.float32))), + -1, + keep_dims=True, + ) + # temp = variance # first exec is okay + eps = self.constant(self.rms_norm_eps) + hidden_states = self.eltwise_div(hidden_states, self.sqrt(self.eltwise_add(variance, eps))) + # temp = hidden_states # first exec is okay + input_layernorm_weight = self.convert_to_fp32(input_layernorm_weight) + + mul_res = self.eltwise_mul(input_layernorm_weight, hidden_states) + temp = mul_res + hidden_states = temp + + # mul_res = self.eltwise_mul(input_layernorm_weight, hidden_states) + # hidden_states = mul_res + # temp = hidden_states + + # temp = self.eltwise_mul(input_layernorm_weight, hidden_states) + + # hidden_states = self.eltwise_mul(input_layernorm_weight, hidden_states) + hidden_states = self.convert_to_fp16(hidden_states) + else: + hidden_states = self.layer_norm(hidden_states, input_layernorm_weight) + input_2d = self.reshape(hidden_states, (self.batch_size * self.seq_len, self.hidden_size)) attn_output, new_key_states, new_value_states = self.attention( - hidden_states=hidden_states, + hidden_states=input_2d, position_ids=position_ids, attention_mask=attention_mask, past_key=past_key, @@ -253,7 +303,11 @@ def build_decoder( hidden_states = self.eltwise_add(residual, hidden_states) hidden_states = self.convert_to_fp16(hidden_states) - return hidden_states, new_key_states, new_value_states + if self.mode == "prefill": + return hidden_states, new_key_states, new_value_states, temp + else: + return hidden_states, new_key_states, new_value_states + # return hidden_states, new_key_states, new_value_states class FusedLlamaLowBitMultiDecoderlayer(torch.nn.Module): @@ -499,9 +553,13 @@ def forward( if self.cached_cos is None: inputs += (cos.to(torch.float32), sin.to(torch.float32),) inputs += (self.layer_norm_0, self.layer_norm_1) - hidden_states, past_key, past_value = run_model( + hidden_states, past_key, past_value, temp = run_model( inputs, self.op_parameters, backend_cls, self.op_id, replica=2 ) + print(f'[DEBUG INFO OF TEMP VAR] temp is {temp}') + # hidden_states, past_key, past_value = run_model( + # inputs, self.op_parameters, backend_cls, self.op_id, replica=2 + # ) cache_kwargs = { "cache_position": cache_position, "max_seq_len": self.max_seq_len, @@ -870,7 +928,11 @@ def run_prefill( result_queue.put("loading finish") with torch.inference_mode(): - for decoder_layer in deocderlayers: + # for decoder_layer in deocderlayers: + for idx, decoder_layer in enumerate(deocderlayers): + print(f'Before running {idx} decoder layer, hidden_states is {hidden_states}') + print(f'Before running {idx} decoder layer, causal_mask is {causal_mask}') + print(f'Before running {idx} decoder layer, position_ids is {position_ids}') layer_outputs = decoder_layer( hidden_states, attention_mask=causal_mask, @@ -884,8 +946,11 @@ def run_prefill( ) hidden_states = layer_outputs[0] + print(f'After running {idx} decoder layer, hidden_states is {hidden_states}') + print('===============') next_decoder_cache = layer_outputs[1] + 
print(f'run_prefill result, hidden_states is {hidden_states}') result_queue.put((hidden_states, next_decoder_cache)) From fcaf71be5a4bc9f64c251e063385c9dd34be0d2d Mon Sep 17 00:00:00 2001 From: plusbang Date: Thu, 12 Dec 2024 13:45:44 +0800 Subject: [PATCH 3/5] fix --- .../src/ipex_llm/transformers/npu_model.py | 46 ++++----- .../transformers/npu_models/llama_mp.py | 96 ++++--------------- .../transformers/npu_models/mp_models_base.py | 3 +- 3 files changed, 41 insertions(+), 104 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index b3ce91cf0ab..673a56ecfaa 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -301,29 +301,29 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - # if (not hasattr(model, 'llm') and - # model.config.model_type in ["qwen2", "llama", "minicpm"]): - # from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process - # optimize_llm_single_process( - # llm, - # kv_len=max_context_len, - # max_prompt_len=max_prompt_len, - # transpose_value_cache=transpose_value_cache, - # group_size=quantization_group_size, - # qtype=qtype, - # save_directory=save_directory, - # fuse_layers=fuse_layers - # ) - # else: - optimize_llm( - llm, - max_context_len=max_context_len, - max_prompt_len=max_prompt_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size - ) + if (not hasattr(model, 'llm') and + model.config.model_type in ["qwen2", "llama", "minicpm"]): + from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process + optimize_llm_single_process( + llm, + kv_len=max_context_len, + max_prompt_len=max_prompt_len, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size, + qtype=qtype, + save_directory=save_directory, + fuse_layers=fuse_layers + ) + else: + optimize_llm( + llm, + max_context_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size + ) else: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm diff --git a/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py index b8de1b11be8..69f618c888a 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py @@ -188,32 +188,18 @@ def __init__( curr_key_values = [] cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids) for i in range(num_layers): - if self.mode == "prefill": - hidden_states, new_key_states, new_value_states, temp = self.build_decoder( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids if cos_condition else None, - input_layernorm_weight=input_layernorm_weights[i], - post_attention_layernorm_weight=post_attn_layernorm_weights[i], - past_key=past_keys[i], - past_value=past_values[i], - use_prefill_sdp=use_prefill_sdp, - cos=self.cos, - sin=self.sin, - ) - else: - hidden_states, new_key_states, new_value_states = self.build_decoder( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids if cos_condition else None, - input_layernorm_weight=input_layernorm_weights[i], - 
post_attention_layernorm_weight=post_attn_layernorm_weights[i], - past_key=past_keys[i], - past_value=past_values[i], - use_prefill_sdp=use_prefill_sdp, - cos=self.cos, - sin=self.sin, - ) + hidden_states, new_key_states, new_value_states = self.build_decoder( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids if cos_condition else None, + input_layernorm_weight=input_layernorm_weights[i], + post_attention_layernorm_weight=post_attn_layernorm_weights[i], + past_key=past_keys[i], + past_value=past_values[i], + use_prefill_sdp=use_prefill_sdp, + cos=self.cos, + sin=self.sin, + ) curr_key_values.append((new_key_states, new_value_states)) # define outputs @@ -223,9 +209,6 @@ def __init__( new_key_states = self.convert_to_fp16(curr_key_values[i][0]) new_value_states = self.convert_to_fp16(curr_key_values[i][1]) - if self.mode == "prefill": - temp = self.convert_to_fp16(temp) - print("start compiling") if mode == "prefill" and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1": self.compile(npu_dpu_groups=6) @@ -247,40 +230,8 @@ def build_decoder( ): residual = hidden_states - # if self.mode == "prefill": - # temp = hidden_states # first exec is okay - # hidden_states = self.layer_norm(hidden_states, input_layernorm_weight) - # # if self.mode == "prefill": - # # temp = hidden_states # first exec is wrong - if self.mode == "prefill": - # temp = hidden_states # first exec is okay - hidden_states = self.convert_to_fp32(hidden_states) - variance = self.reduce_mean( - self.power(hidden_states, self.constant(np.array([[2]], dtype=np.float32))), - -1, - keep_dims=True, - ) - # temp = variance # first exec is okay - eps = self.constant(self.rms_norm_eps) - hidden_states = self.eltwise_div(hidden_states, self.sqrt(self.eltwise_add(variance, eps))) - # temp = hidden_states # first exec is okay - input_layernorm_weight = self.convert_to_fp32(input_layernorm_weight) - - mul_res = self.eltwise_mul(input_layernorm_weight, hidden_states) - temp = mul_res - hidden_states = temp - - # mul_res = self.eltwise_mul(input_layernorm_weight, hidden_states) - # hidden_states = mul_res - # temp = hidden_states - - # temp = self.eltwise_mul(input_layernorm_weight, hidden_states) - - # hidden_states = self.eltwise_mul(input_layernorm_weight, hidden_states) - hidden_states = self.convert_to_fp16(hidden_states) - else: - hidden_states = self.layer_norm(hidden_states, input_layernorm_weight) input_2d = self.reshape(hidden_states, (self.batch_size * self.seq_len, self.hidden_size)) + input_2d = self.layer_norm(input_2d, input_layernorm_weight) attn_output, new_key_states, new_value_states = self.attention( hidden_states=input_2d, position_ids=position_ids, @@ -303,11 +254,7 @@ def build_decoder( hidden_states = self.eltwise_add(residual, hidden_states) hidden_states = self.convert_to_fp16(hidden_states) - if self.mode == "prefill": - return hidden_states, new_key_states, new_value_states, temp - else: - return hidden_states, new_key_states, new_value_states - # return hidden_states, new_key_states, new_value_states + return hidden_states, new_key_states, new_value_states class FusedLlamaLowBitMultiDecoderlayer(torch.nn.Module): @@ -553,13 +500,9 @@ def forward( if self.cached_cos is None: inputs += (cos.to(torch.float32), sin.to(torch.float32),) inputs += (self.layer_norm_0, self.layer_norm_1) - hidden_states, past_key, past_value, temp = run_model( + hidden_states, past_key, past_value = run_model( inputs, self.op_parameters, backend_cls, self.op_id, replica=2 ) - 
print(f'[DEBUG INFO OF TEMP VAR] temp is {temp}') - # hidden_states, past_key, past_value = run_model( - # inputs, self.op_parameters, backend_cls, self.op_id, replica=2 - # ) cache_kwargs = { "cache_position": cache_position, "max_seq_len": self.max_seq_len, @@ -928,11 +871,7 @@ def run_prefill( result_queue.put("loading finish") with torch.inference_mode(): - # for decoder_layer in deocderlayers: - for idx, decoder_layer in enumerate(deocderlayers): - print(f'Before running {idx} decoder layer, hidden_states is {hidden_states}') - print(f'Before running {idx} decoder layer, causal_mask is {causal_mask}') - print(f'Before running {idx} decoder layer, position_ids is {position_ids}') + for decoder_layer in deocderlayers: layer_outputs = decoder_layer( hidden_states, attention_mask=causal_mask, @@ -946,11 +885,8 @@ def run_prefill( ) hidden_states = layer_outputs[0] - print(f'After running {idx} decoder layer, hidden_states is {hidden_states}') - print('===============') next_decoder_cache = layer_outputs[1] - print(f'run_prefill result, hidden_states is {hidden_states}') result_queue.put((hidden_states, next_decoder_cache)) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index a1dac609243..22fc882546a 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -470,7 +470,8 @@ def layer_norm(self, hidden_states, layernorm_weight): ) eps = self.constant(self.rms_norm_eps) hidden_states = self.eltwise_div(hidden_states, self.sqrt(self.eltwise_add(variance, eps))) - layernorm_weight = self.convert_to_fp32(layernorm_weight) + # layernorm_weight = self.convert_to_fp32(layernorm_weight) + hidden_states = self.convert_to_fp16(hidden_states) hidden_states = self.eltwise_mul(layernorm_weight, hidden_states) hidden_states = self.convert_to_fp16(hidden_states) return hidden_states From bd998a319b036f9df550cc199a5687e3e0c2917e Mon Sep 17 00:00:00 2001 From: plusbang Date: Thu, 12 Dec 2024 13:50:21 +0800 Subject: [PATCH 4/5] test --- python/llm/src/ipex_llm/transformers/npu_models/llama.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/llama.py b/python/llm/src/ipex_llm/transformers/npu_models/llama.py index 4447311cdc6..ab4c2025a25 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/llama.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/llama.py @@ -128,7 +128,7 @@ def llama_model_forward( all_self_attns = () if output_attentions else None next_decoder_cache = None - for idx, decoder_layer in enumerate(self.layers): + for decoder_layer in self.layers: if output_hidden_states: all_hidden_states += (hidden_states,) @@ -144,9 +144,6 @@ def llama_model_forward( cache_position, ) else: - print(f'Before running {idx} decoder layer, hidden_states is {hidden_states}') - print(f'Before running {idx} decoder layer, causal_mask is {causal_mask}') - print(f'Before running {idx} decoder layer, position_ids is {position_ids}') layer_outputs = decoder_layer( hidden_states, attention_mask=causal_mask, @@ -158,8 +155,6 @@ def llama_model_forward( ) hidden_states = layer_outputs[0] - print(f'After running {idx} decoder layer, hidden_states is {hidden_states}') - print('===============') if use_cache: next_decoder_cache = layer_outputs[2 if output_attentions else 1] @@ -167,7 +162,6 @@ def llama_model_forward( if output_attentions: 
all_self_attns += (layer_outputs[1],) - print(f'run_prefill result, hidden_states is {hidden_states}') hidden_states = self.norm(hidden_states) # add hidden states from the last decoder layer From 3bb4b7ec753ce83e0194d3ff2b55baa7813ef51b Mon Sep 17 00:00:00 2001 From: plusbang Date: Thu, 12 Dec 2024 14:49:09 +0800 Subject: [PATCH 5/5] rm comment --- .../llm/src/ipex_llm/transformers/npu_models/mp_models_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index 22fc882546a..b25a3dae0fe 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -470,7 +470,6 @@ def layer_norm(self, hidden_states, layernorm_weight): ) eps = self.constant(self.rms_norm_eps) hidden_states = self.eltwise_div(hidden_states, self.sqrt(self.eltwise_add(variance, eps))) - # layernorm_weight = self.convert_to_fp32(layernorm_weight) hidden_states = self.convert_to_fp16(hidden_states) hidden_states = self.eltwise_mul(layernorm_weight, hidden_states) hidden_states = self.convert_to_fp16(hidden_states)
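
Net effect of the series: the llama_mp.py change from patch 1 and the debug instrumentation from patch 2 are reverted by patches 3 and 4, so the only surviving edit is the dtype handling in mp_models_base.layer_norm — the normalized hidden states are cast back to fp16 before the elementwise multiply with the fp16 layernorm weight, instead of upcasting the weight to fp32. Below is a minimal PyTorch sketch of that ordering, using plain tensors rather than the NPU graph ops and illustrative names only; it assumes the weight tensor is already fp16.

    import torch

    def rms_norm_fp16_weight(hidden_states: torch.Tensor,
                             weight: torch.Tensor,
                             eps: float = 1e-6) -> torch.Tensor:
        # normalize in fp32 for numerical stability, as the helper does
        h = hidden_states.to(torch.float32)
        variance = h.pow(2).mean(dim=-1, keepdim=True)
        h = h / torch.sqrt(variance + eps)
        # cast back to fp16 before the multiply so the fp16 weight is used
        # directly (the ordering patches 3 and 5 settle on)
        return weight * h.to(torch.float16)

The helper keeps its trailing convert_to_fp16 after the multiply; the sketch omits that step because the product above is already fp16.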