[NPU] Compatible with other third-party models like auto-round (#12620)
* support third party model

* simplify code

* fix style

* fix sym int4 GW

* code refactor

* fix
rnwang04 authored Dec 26, 2024
1 parent a9abde0 commit bbdbbb0
Showing 5 changed files with 64 additions and 136 deletions.
python/llm/src/ipex_llm/transformers/npu_models/linear.py (3 changes: 2 additions & 1 deletion)
@@ -162,7 +162,8 @@ def __init__(
         self.zero = None
         if group_size != 0:
             self.scale = Parameter(scale, requires_grad=False)
-            self.zero = Parameter(zero, requires_grad=False)
+            if zero is not None:
+                self.zero = Parameter(zero, requires_grad=False)
         else:
             if self.weight.dtype == torch.uint8:
                 # Int4 we need to double the input channels because weights are compressed
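For context on the new guard: symmetric weight-quantization schemes (such as auto-round's sym_int4) ship only per-group scales, while asymmetric schemes also ship zero points, so `zero` can legitimately be None for third-party checkpoints. Below is a minimal standalone sketch of that difference; it is not ipex-llm code, and `dequantize_group` plus the toy tensors are illustrative only.

import torch

def dequantize_group(qweight, scale, zero=None):
    # Illustrative per-group dequantization: with a zero point the stored
    # integers are shifted before scaling; without one (symmetric case,
    # e.g. auto-round sym_int4) only the scale is applied.
    q = qweight.to(torch.float32)
    if zero is not None:
        return (q - zero) * scale
    return q * scale

# Toy group of int4-range values with a single per-group scale.
q = torch.tensor([-8, -3, 0, 2, 5, 7, -1, 4], dtype=torch.int8)
w_sym = dequantize_group(q, scale=0.05)              # symmetric: zero stays None
w_asym = dequantize_group(q, scale=0.05, zero=-2.0)  # asymmetric: zero point provided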
python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py (46 changes: 46 additions & 0 deletions)
@@ -21,6 +21,7 @@
 from typing import Sequence
 from intel_npu_acceleration_library.backend.factory import NNFactory
 import numpy as np
+import torch


 def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True,
@@ -170,3 +171,48 @@ def __init__(

         print("start compiling")
         self.compile()
+
+
+def obtain_weight_from_single_layer(attn_layer, mlp_layer):
+    weights = []
+    if hasattr(attn_layer, "q_proj_dq_list"):
+        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
+                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
+                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
+                           mlp_layer.down_proj_dq_list]:
+            l_weights = []
+            scales = []
+            zeros = []
+            for l in layer_list:
+                l_weights.append(l.weight)
+                scales.append(l.scale)
+                if l.zero is not None:
+                    zeros.append(l.zero)
+            if len(zeros):
+                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
+                                torch.stack(zeros, axis=0)))
+            else:
+                weights.append((torch.stack(l_weights, axis=0),
+                                torch.stack(scales, axis=0)))
+    else:
+        for layer in [attn_layer.q_proj, attn_layer.k_proj,
+                      attn_layer.v_proj, attn_layer.o_proj,
+                      mlp_layer.gate_proj, mlp_layer.up_proj,
+                      mlp_layer.down_proj]:
+            if layer.zero is not None:
+                weights.append((layer.weight, layer.scale, layer.zero))
+            else:
+                weights.append((layer.weight, layer.scale))
+    return weights
+
+
+def obtain_qkv_bias_from_single_layer(attn_layer):
+    if hasattr(attn_layer, "q_proj_dq_list"):
+        q_bias = attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16)
+        k_bias = attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16)
+        v_bias = attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16)
+    else:
+        q_bias = attn_layer.q_proj.bias.to(torch.float16)
+        k_bias = attn_layer.k_proj.bias.to(torch.float16)
+        v_bias = attn_layer.v_proj.bias.to(torch.float16)
+    return q_bias, k_bias, v_bias
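A short usage sketch for the two new helpers, mirroring the converter changes further down. It assumes `model` is a model already quantized and converted for the NPU; the unpacking shows the two tuple shapes `obtain_weight_from_single_layer` can return, covering both dq-split layers and plain Linear layers from third-party checkpoints.

from ipex_llm.transformers.npu_pipeline_model.common import (
    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer)

curr_layer = model.model.layers[0]      # assumed: an already-converted decoder layer
attn_layer = curr_layer.self_attn
mlp_layer = curr_layer.mlp

weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
for entry in weights:
    if len(entry) == 3:
        weight, scale, zero = entry     # asymmetric quantization: zero points present
    else:
        weight, scale = entry           # symmetric (e.g. auto-round sym_int4): no zero point
    # For dq-split layers the tensors are stacked along dim 0, one slice per split.

q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)  # float16 biases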
python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py (47 changes: 4 additions & 43 deletions)
@@ -18,7 +18,8 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead
+from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer
 from intel_npu_acceleration_library.backend.factory import NNFactory


@@ -261,26 +262,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     curr_layer = model.model.layers[layer_idx]
     attn_layer = curr_layer.self_attn
     mlp_layer = curr_layer.mlp
-
-    weights = []
-    for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                       attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                       mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                       mlp_layer.down_proj_dq_list]:
-        l_weights = []
-        scales = []
-        zeros = []
-        for l in layer_list:
-            l_weights.append(l.weight)
-            scales.append(l.scale)
-            if l.zero is not None:
-                zeros.append(l.zero)
-        if len(zeros):
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                            torch.stack(zeros, axis=0)))
-        else:
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
+    weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
         # llama-2-7B & llama-3-8B
         cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
@@ -400,32 +382,11 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_dow
     input_layer_norm_weights = []
     post_attn_layernorm_weights = []
     layer_indexs = range(layer_start, layer_end)
-    n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list)
-    n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list)
     for layer_idx in layer_indexs:
         curr_layer = model.model.layers[layer_idx]
         attn_layer = curr_layer.self_attn
         mlp_layer = curr_layer.mlp
-
-        weights = []
-        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                           mlp_layer.down_proj_dq_list]:
-            l_weights = []
-            scales = []
-            zeros = []
-            for l in layer_list:
-                l_weights.append(l.weight)
-                scales.append(l.scale)
-                if l.zero is not None:
-                    zeros.append(l.zero)
-            if len(zeros):
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                                torch.stack(zeros, axis=0)))
-            else:
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
+        weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
             # llama-2-7B & llama-3-8B
             cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
python/llm/src/ipex_llm/transformers/npu_pipeline_model/minicpm.py (46 changes: 3 additions & 43 deletions)
@@ -18,7 +18,7 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob
+from .common import update_names_of_IR_and_export_blob, obtain_weight_from_single_layer
 from intel_npu_acceleration_library.backend.factory import NNFactory
 from ipex_llm.transformers.npu_models.mp_models_base import LLMBaseNNFactory
 from typing import Sequence
@@ -309,26 +309,7 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     curr_layer = model.model.layers[layer_idx]
     attn_layer = curr_layer.self_attn
     mlp_layer = curr_layer.mlp
-
-    weights = []
-    for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                       attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                       mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                       mlp_layer.down_proj_dq_list]:
-        l_weights = []
-        scales = []
-        zeros = []
-        for l in layer_list:
-            l_weights.append(l.weight)
-            scales.append(l.scale)
-            if l.zero is not None:
-                zeros.append(l.zero)
-        if len(zeros):
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                            torch.stack(zeros, axis=0)))
-        else:
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
+    weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
     cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
@@ -425,32 +406,11 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_d
     input_layer_norm_weights = []
     post_attn_layernorm_weights = []
     layer_indexs = range(layer_start, layer_end)
-    n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list)
-    n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list)
     for layer_idx in layer_indexs:
         curr_layer = model.model.layers[layer_idx]
         attn_layer = curr_layer.self_attn
         mlp_layer = curr_layer.mlp
-
-        weights = []
-        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                           mlp_layer.down_proj_dq_list]:
-            l_weights = []
-            scales = []
-            zeros = []
-            for l in layer_list:
-                l_weights.append(l.weight)
-                scales.append(l.scale)
-                if l.zero is not None:
-                    zeros.append(l.zero)
-            if len(zeros):
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                                torch.stack(zeros, axis=0)))
-            else:
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
+        weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
         cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
         cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
         layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py (58 changes: 9 additions & 49 deletions)
@@ -18,7 +18,8 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead
+from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead


@@ -132,29 +133,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     curr_layer = model.model.layers[layer_idx]
     attn_layer = curr_layer.self_attn
     mlp_layer = curr_layer.mlp
-
-    weights = []
-    for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                       attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                       mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                       mlp_layer.down_proj_dq_list]:
-        l_weights = []
-        scales = []
-        zeros = []
-        for l in layer_list:
-            l_weights.append(l.weight)
-            scales.append(l.scale)
-            if l.zero is not None:
-                zeros.append(l.zero)
-        if len(zeros):
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                            torch.stack(zeros, axis=0)))
-        else:
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
-    q_bias = attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16)
-    k_bias = attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16)
-    v_bias = attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16)
+    weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
+    q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
     cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
     cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
@@ -263,32 +243,11 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
     k_biases = []
     v_biases = []
     layer_indexs = range(layer_start, layer_end)
-    n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list)
-    n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list)
     for layer_idx in layer_indexs:
         curr_layer = model.model.layers[layer_idx]
         attn_layer = curr_layer.self_attn
         mlp_layer = curr_layer.mlp
-
-        weights = []
-        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list,
-                           mlp_layer.down_proj_dq_list]:
-            l_weights = []
-            scales = []
-            zeros = []
-            for l in layer_list:
-                l_weights.append(l.weight)
-                scales.append(l.scale)
-                if l.zero is not None:
-                    zeros.append(l.zero)
-            if len(zeros):
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
-                                torch.stack(zeros, axis=0)))
-            else:
-                weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
+        weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
         cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
         cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
         layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
@@ -297,9 +256,10 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
         layer_weights.extend(weights)
         input_layer_norm_weights.append(layer_norm_0)
         post_attn_layernorm_weights.append(layer_norm_1)
-        q_biases.append(attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16))
-        k_biases.append(attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16))
-        v_biases.append(attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16))
+        q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
+        q_biases.append(q_bias)
+        k_biases.append(k_bias)
+        v_biases.append(v_bias)

         # save weight
         input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
