diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py
index c8a5dd467ae..461d9a7012b 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py
@@ -162,7 +162,8 @@ def __init__(
         self.zero = None
         if group_size != 0:
             self.scale = Parameter(scale, requires_grad=False)
-            self.zero = Parameter(zero, requires_grad=False)
+            if zero is not None:
+                self.zero = Parameter(zero, requires_grad=False)
         else:
             if self.weight.dtype == torch.uint8:
                 # Int4 we need to double the input channels because weights are compressed
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
index 687fb7aa755..fbccd683d70 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
@@ -21,6 +21,7 @@
 from typing import Sequence
 from intel_npu_acceleration_library.backend.factory import NNFactory
 import numpy as np
+import torch
 
 
 def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True,
@@ -170,3 +171,48 @@ def __init__(
 
         print("start compiling")
         self.compile()
+
+
+def obtain_weight_from_single_layer(attn_layer, mlp_layer):
+    weights = []
+    if hasattr(attn_layer, "q_proj_dq_list"):
+        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
+                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
+                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
+                           mlp_layer.down_proj_dq_list]:
+            l_weights = []
+            scales = []
+            zeros = []
+            for l in layer_list:
+                l_weights.append(l.weight)
+                scales.append(l.scale)
+                if l.zero is not None:
+                    zeros.append(l.zero)
+            if len(zeros):
+                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
+                                torch.stack(zeros, axis=0)))
+            else:
+                weights.append((torch.stack(l_weights, axis=0),
+                                torch.stack(scales, axis=0)))
+    else:
+        for layer in [attn_layer.q_proj, attn_layer.k_proj,
+                      attn_layer.v_proj, attn_layer.o_proj,
+                      mlp_layer.gate_proj, mlp_layer.up_proj,
+                      mlp_layer.down_proj]:
+            if layer.zero is not None:
+                weights.append((layer.weight, layer.scale, layer.zero))
+            else:
+                weights.append((layer.weight, layer.scale))
+    return weights
+
+
+def obtain_qkv_bias_from_single_layer(attn_layer):
+    if hasattr(attn_layer, "q_proj_dq_list"):
+        q_bias = attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16)
+        k_bias = attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16)
+        v_bias = attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16)
+    else:
+        q_bias = attn_layer.q_proj.bias.to(torch.float16)
+        k_bias = attn_layer.k_proj.bias.to(torch.float16)
+        v_bias = attn_layer.v_proj.bias.to(torch.float16)
+    return q_bias, k_bias, v_bias
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py
index 03700b053cc..aebff3d6483 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py
@@ -18,7 +18,8 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead
+from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer
 from intel_npu_acceleration_library.backend.factory import NNFactory
 
 
@@ -261,26 +262,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     curr_layer = model.model.layers[layer_idx]
     attn_layer = curr_layer.self_attn
     mlp_layer = curr_layer.mlp
-
-    weights = []
-    for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                       attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                       mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                       mlp_layer.down_proj_dq_list]:
-        l_weights = []
-        scales = []
-        zeros = []
-        for l in layer_list:
-            l_weights.append(l.weight)
-            scales.append(l.scale)
-            if l.zero is not None:
-                zeros.append(l.zero)
-        if len(zeros):
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                            torch.stack(zeros, axis=0)))
-        else:
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
+    weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
         # llama-2-7B & llama-3-8B
         cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
@@ -400,32 +382,11 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_dow
     input_layer_norm_weights = []
     post_attn_layernorm_weights = []
     layer_indexs = range(layer_start, layer_end)
-    n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list)
-    n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list)
     for layer_idx in layer_indexs:
         curr_layer = model.model.layers[layer_idx]
         attn_layer = curr_layer.self_attn
         mlp_layer = curr_layer.mlp
-
-        weights = []
-        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                           mlp_layer.down_proj_dq_list]:
-            l_weights = []
-            scales = []
-            zeros = []
-            for l in layer_list:
-                l_weights.append(l.weight)
-                scales.append(l.scale)
-                if l.zero is not None:
-                    zeros.append(l.zero)
-            if len(zeros):
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                                torch.stack(zeros, axis=0)))
-            else:
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
+        weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
             # llama-2-7B & llama-3-8B
             cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/minicpm.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/minicpm.py
index 9e89584a035..9eddce77e95 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/minicpm.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/minicpm.py
@@ -18,7 +18,7 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob
+from .common import update_names_of_IR_and_export_blob, obtain_weight_from_single_layer
 from intel_npu_acceleration_library.backend.factory import NNFactory
 from ipex_llm.transformers.npu_models.mp_models_base import LLMBaseNNFactory
 from typing import Sequence
@@ -309,26 +309,7 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     curr_layer = model.model.layers[layer_idx]
     attn_layer = curr_layer.self_attn
     mlp_layer = curr_layer.mlp
-
-    weights = []
-    for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                       attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                       mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                       mlp_layer.down_proj_dq_list]:
-        l_weights = []
-        scales = []
-        zeros = []
-        for l in layer_list:
-            l_weights.append(l.weight)
-            scales.append(l.scale)
-            if l.zero is not None:
-                zeros.append(l.zero)
-        if len(zeros):
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                            torch.stack(zeros, axis=0)))
-        else:
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
+    weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
     cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
@@ -425,32 +406,11 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_d
     input_layer_norm_weights = []
     post_attn_layernorm_weights = []
     layer_indexs = range(layer_start, layer_end)
-    n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list)
-    n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list)
     for layer_idx in layer_indexs:
         curr_layer = model.model.layers[layer_idx]
         attn_layer = curr_layer.self_attn
         mlp_layer = curr_layer.mlp
-
-        weights = []
-        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                           mlp_layer.down_proj_dq_list]:
-            l_weights = []
-            scales = []
-            zeros = []
-            for l in layer_list:
-                l_weights.append(l.weight)
-                scales.append(l.scale)
-                if l.zero is not None:
-                    zeros.append(l.zero)
-            if len(zeros):
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                                torch.stack(zeros, axis=0)))
-            else:
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
+        weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
         cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
         cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
         layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
index bb8003f06a7..ffe2707d9a5 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -18,7 +18,8 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead
+from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
@@ -132,29 +133,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     curr_layer = model.model.layers[layer_idx]
     attn_layer = curr_layer.self_attn
     mlp_layer = curr_layer.mlp
-
-    weights = []
-    for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                       attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                       mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                       mlp_layer.down_proj_dq_list]:
-        l_weights = []
-        scales = []
-        zeros = []
-        for l in layer_list:
-            l_weights.append(l.weight)
-            scales.append(l.scale)
-            if l.zero is not None:
-                zeros.append(l.zero)
-        if len(zeros):
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                            torch.stack(zeros, axis=0)))
-        else:
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
-    q_bias = attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16)
-    k_bias = attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16)
-    v_bias = attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16)
+    weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
+    q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
     cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
     cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
@@ -263,32 +243,11 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
     k_biases = []
     v_biases = []
     layer_indexs = range(layer_start, layer_end)
-    n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list)
-    n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list)
     for layer_idx in layer_indexs:
         curr_layer = model.model.layers[layer_idx]
         attn_layer = curr_layer.self_attn
         mlp_layer = curr_layer.mlp
-
-        weights = []
-        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                           mlp_layer.down_proj_dq_list]:
-            l_weights = []
-            scales = []
-            zeros = []
-            for l in layer_list:
-                l_weights.append(l.weight)
-                scales.append(l.scale)
-                if l.zero is not None:
-                    zeros.append(l.zero)
-            if len(zeros):
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                                torch.stack(zeros, axis=0)))
-            else:
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
+        weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
         cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
         cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
         layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
@@ -297,9 +256,10 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
         layer_weights.extend(weights)
         input_layer_norm_weights.append(layer_norm_0)
         post_attn_layernorm_weights.append(layer_norm_1)
-        q_biases.append(attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16))
-        k_biases.append(attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16))
-        v_biases.append(attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16))
+        q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
+        q_biases.append(q_bias)
+        k_biases.append(k_bias)
+        v_biases.append(v_bias)
 
         # save weight
         input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
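
After this patch, the per-model converters all gather quantized weights through the two new helpers in `common.py`. The snippet below is a minimal sketch, not part of the patch, showing how those helpers behave on a hypothetical non-split layer; the `make_proj` stand-in and its tensors are invented for illustration and only mirror the attributes the helpers actually read (`weight`, `scale`, `zero`, `bias`). It assumes an ipex-llm install with this change applied.

```python
# Illustrative only: exercises obtain_weight_from_single_layer /
# obtain_qkv_bias_from_single_layer with made-up stand-in layers.
from types import SimpleNamespace

import torch

from ipex_llm.transformers.npu_pipeline_model.common import (
    obtain_weight_from_single_layer,
    obtain_qkv_bias_from_single_layer,
)


def make_proj(out_features=64, in_features=64, with_zero=True):
    # Stand-in for a quantized linear: only the attributes the helpers touch.
    return SimpleNamespace(
        weight=torch.randint(0, 255, (out_features, in_features), dtype=torch.uint8),
        scale=torch.ones(out_features, dtype=torch.float16),
        zero=torch.zeros(out_features, dtype=torch.float16) if with_zero else None,
        bias=torch.zeros(out_features, dtype=torch.float32),
    )


# No q_proj_dq_list attribute, so both helpers take the non-split branch.
attn_layer = SimpleNamespace(q_proj=make_proj(), k_proj=make_proj(),
                             v_proj=make_proj(), o_proj=make_proj())
mlp_layer = SimpleNamespace(gate_proj=make_proj(), up_proj=make_proj(),
                            down_proj=make_proj())

weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)

# 7 entries (q/k/v/o + gate/up/down); 3-tuples because zero is present here,
# 2-tuples when zero is None (matching the guarded Parameter in linear.py).
print(len(weights), [len(t) for t in weights])
print(q_bias.dtype, k_bias.dtype, v_bias.dtype)
```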