diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
index 16030396957..fbccd683d70 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
@@ -29,7 +29,6 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
     xml_path = os.path.join(dir, model_name + ".xml")
     bin_path = os.path.join(dir, model_name + ".bin")
     model.serialize(xml_path, bin_path)
-    # model.save(xml_path)
     new_ir_path = os.path.join(dir, model_name + "_new.xml")
     new_bin_path = os.path.join(dir, model_name + "_new.bin")
     blob_path = os.path.join(dir, model_name + ".blob")
@@ -178,9 +177,9 @@ def obtain_weight_from_single_layer(attn_layer, mlp_layer):
     weights = []
     if hasattr(attn_layer, "q_proj_dq_list"):
         for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                       attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                       mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                       mlp_layer.down_proj_dq_list]:
+                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
+                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
+                           mlp_layer.down_proj_dq_list]:
             l_weights = []
             scales = []
             zeros = []
@@ -197,9 +196,9 @@ def obtain_weight_from_single_layer(attn_layer, mlp_layer):
                             torch.stack(scales, axis=0)))
     else:
         for layer in [attn_layer.q_proj, attn_layer.k_proj,
-                  attn_layer.v_proj, attn_layer.o_proj,
-                  mlp_layer.gate_proj, mlp_layer.up_proj,
-                  mlp_layer.down_proj]:
+                      attn_layer.v_proj, attn_layer.o_proj,
+                      mlp_layer.gate_proj, mlp_layer.up_proj,
+                      mlp_layer.down_proj]:
             if layer.zero is not None:
                 weights.append((layer.weight, layer.scale, layer.zero))
             else:
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
index d8d11b9e1dc..ffe2707d9a5 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -134,7 +134,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     attn_layer = curr_layer.self_attn
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
-    q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer
+    q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
     cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
     cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
@@ -256,7 +256,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
         layer_weights.extend(weights)
         input_layer_norm_weights.append(layer_norm_0)
         post_attn_layernorm_weights.append(layer_norm_1)
-        q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer
+        q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
         q_biases.append(q_bias)
         k_biases.append(k_bias)
         v_biases.append(v_bias)