LLM: Optimize cohere model (#10878)
* use mlp and rms

* optimize kv_cache

* add fuse qkv (see the sketch after this list)

* add flash attention and fp16 sdp

* error fp8 sdp

* fix optimized

* fix style

* update

* add for pp
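
The "fuse qkv" item above refers to merging the separate query/key/value projections into a single matmul so the input activation is read once instead of three times. A minimal sketch of the idea in plain PyTorch (illustrative only; the class name FusedQKV is hypothetical, not code from this PR):

    import torch
    import torch.nn as nn

    class FusedQKV(nn.Module):
        # Illustrative fused q/k/v projection (hypothetical, not the PR's code).
        def __init__(self, hidden_size: int):
            super().__init__()
            # One weight holding W_q, W_k, and W_v stacked along the output dim.
            self.qkv_proj = nn.Linear(hidden_size, 3 * hidden_size, bias=False)

        def forward(self, hidden_states: torch.Tensor):
            qkv = self.qkv_proj(hidden_states)   # [batch, seq, 3 * hidden]
            q, k, v = qkv.chunk(3, dim=-1)       # split back into q, k, v
            return q, k, v

Fusing the three projections lets one larger GEMM replace three smaller ones, which generally improves device utilization and reduces kernel-launch overhead.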
hzjane authored May 7, 2024
1 parent 13a44cd commit 191b184
Showing 2 changed files with 475 additions and 0 deletions.
18 changes: 18 additions & 0 deletions python/llm/src/ipex_llm/transformers/convert.py
@@ -1282,6 +1282,24 @@ def _optimize_post(model, lightweight_bmm=False):
         convert_forward(model,
                         module.Qwen2MoeAttention,
                         qwen2moe_attention_forward)
+    elif model.config.model_type == "cohere":
+        # for CohereForAI/c4ai-command-r-v01
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)
+        from ipex_llm.transformers.models.cohere import cohere_attention_forward
+        from ipex_llm.transformers.models.cohere import cohere_model_forward
+        convert_forward(model,
+                        module.CohereModel,
+                        cohere_model_forward)
+        convert_forward(model,
+                        module.CohereAttention,
+                        cohere_attention_forward)
+        convert_forward(model,
+                        module.CohereLayerNorm,
+                        llama_rms_norm_forward)
+        convert_forward(model,
+                        module.CohereMLP,
+                        llama_mlp_forward)
     elif model.config.model_type == "aquila":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
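
The convert_forward calls above patch the optimized functions onto every matching submodule of the loaded model. A minimal sketch of that replacement pattern, assuming the real ipex_llm helper behaves roughly like this (the body shown is an assumption, not the library's actual implementation):

    import types
    import torch.nn as nn

    def convert_forward(model: nn.Module, target_class, new_forward):
        # Assumed behavior: walk the module tree and rebind `forward`
        # on every instance of the target class.
        for module in model.modules():
            if module.__class__ == target_class:
                module.forward = types.MethodType(new_forward, module)

Under this pattern, cohere_model_forward and cohere_attention_forward take over the HuggingFace CohereModel and CohereAttention forwards, while CohereLayerNorm and CohereMLP reuse the existing llama_rms_norm_forward and llama_mlp_forward implementations.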
The second changed file (python/llm/src/ipex_llm/transformers/models/cohere.py, the remaining 457 additions) did not render on this page.
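
The "add flash attention and fp16 sdp" item lands in that second file's attention forward. The general pattern for fp16 scaled-dot-product attention in PyTorch (a sketch of the technique, not the PR's code):

    import torch
    import torch.nn.functional as F

    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32
    # Shapes are [batch, heads, seq, head_dim].
    q = torch.randn(1, 8, 128, 64, device=device, dtype=dtype)
    k = torch.randn_like(q)
    v = torch.randn_like(q)
    # Dispatches to a fused flash-attention kernel when one is available.
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)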
