From ad144001e62c9d550358669de7a6e191f971fab3 Mon Sep 17 00:00:00 2001
From: Yishuo Wang
Date: Wed, 10 Jul 2024 14:25:36 +0800
Subject: [PATCH] optimize internlm xcomposer2 performance

---
 .../ipex_llm/transformers/models/internlm.py | 33 +++++++++++++++----
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/models/internlm.py b/python/llm/src/ipex_llm/transformers/models/internlm.py
index 1e8634b903f..227293e497d 100644
--- a/python/llm/src/ipex_llm/transformers/models/internlm.py
+++ b/python/llm/src/ipex_llm/transformers/models/internlm.py
@@ -42,6 +42,7 @@
 import torch
 import torch.utils.checkpoint
 from torch import nn
+from ipex_llm.utils.common.log4Error import invalidInputError
 from ipex_llm.transformers.models.utils import should_use_fuse_rope, apply_rotary_pos_emb
 from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu
 from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache
@@ -307,10 +308,16 @@ def pre_process_attn_and_mlp(module: torch.nn.Module):
 def add_lora(x: torch.Tensor, result: torch.Tensor,
              im_mask: torch.Tensor = None, lora_scaling: float = 0,
              Plora_A: torch.nn.Linear = None, Plora_B: torch.nn.Linear = None):
-    if im_mask is not None and torch.sum(im_mask) > 0:
-        part_x = x[im_mask]
-        result[im_mask] += Plora_B(Plora_A(part_x) * lora_scaling)
-    return result
+    invalidInputError(x.dim() == 3 and result.dim() == 3,
+                      "`x` and `result` should have 3 dims")
+    if len(im_mask) == 0 or x.size(1) == 1:
+        return result
+    else:
+        for start_idx, end_idx in im_mask:
+            result[:, start_idx:end_idx, :] += Plora_B(
+                Plora_A(x[:, start_idx:end_idx, :]) * lora_scaling
+            )
+        return result
 
 
 def internlm_xcomposser2_attention_forward(
@@ -457,18 +464,32 @@ def internlm_xcomposser2_chat(
     **kwargs,
 ):
     # ipex-llm changes start: fix device and dtype conversion
+    # replace im_mask with start_idx and end_idx to improve performance
     if image is None:
         inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
-        im_mask = torch.zeros(inputs['input_ids'].shape[:2]).bool()
+        im_mask = []
     else:
         image = self.encode_img(image)
         inputs, im_mask = self.interleav_wrap_chat(tokenizer, query, image, history,
                                                    meta_instruction)
+        mask = im_mask.cpu().flatten().tolist()
+        length = len(mask)
+        im_mask = []
+        i = 0
+        while i < length:
+            while i < length and not mask[i]:
+                i = i + 1
+            start_idx = i
+            while i < length and mask[i]:
+                i = i + 1
+            end_idx = i
+            if start_idx != end_idx:
+                im_mask.append((start_idx, end_idx))
+
     inputs = {
         k: v.to(device=self.device, dtype=self.dtype)
         for k, v in inputs.items() if torch.is_tensor(v)
    }
-    im_mask = im_mask.to(self.device)
     # ipex-llm changes end
 
     # also add end-of-assistant token in eos token id to avoid unnecessary generation
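
For reference, the core of this change is converting the boolean image-token mask into a list of contiguous (start_idx, end_idx) ranges so that add_lora can slice instead of boolean-indexing on every decode step. Below is a minimal standalone sketch of that conversion, not part of the patch; the helper name mask_to_ranges and the toy mask values are illustrative only.

import torch


def mask_to_ranges(im_mask: torch.Tensor):
    # collapse a boolean mask into contiguous (start_idx, end_idx) runs,
    # mirroring the while-loop added in internlm_xcomposser2_chat
    mask = im_mask.cpu().flatten().tolist()
    length = len(mask)
    ranges = []
    i = 0
    while i < length:
        # skip text tokens (False entries)
        while i < length and not mask[i]:
            i += 1
        start_idx = i
        # consume one contiguous run of image tokens (True entries)
        while i < length and mask[i]:
            i += 1
        end_idx = i
        if start_idx != end_idx:
            ranges.append((start_idx, end_idx))
    return ranges


if __name__ == "__main__":
    # toy mask: tokens 2-4 and 7-8 are image tokens
    im_mask = torch.tensor([0, 0, 1, 1, 1, 0, 0, 1, 1, 0]).bool()
    print(mask_to_ranges(im_mask))  # [(2, 5), (7, 9)]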
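
Likewise, a small sketch (again not part of the patch) of why the slice-based partial-LoRA application matches the previous boolean-mask version: each contiguous image-token range is sliced and updated in place, avoiding the gather/scatter of advanced indexing on x[im_mask]. The layer sizes, the Plora stand-ins, and the lora_old/lora_new names below are assumptions for illustration.

import torch


def lora_old(x, result, im_mask, scaling, A, B):
    # previous path: boolean-mask indexing over all token positions
    if im_mask is not None and torch.sum(im_mask) > 0:
        part_x = x[im_mask]
        result[im_mask] += B(A(part_x) * scaling)
    return result


def lora_new(x, result, ranges, scaling, A, B):
    # patched path: one slice per contiguous (start_idx, end_idx) range
    for start_idx, end_idx in ranges:
        result[:, start_idx:end_idx, :] += B(
            A(x[:, start_idx:end_idx, :]) * scaling
        )
    return result


if __name__ == "__main__":
    torch.manual_seed(0)
    hidden, rank, seq = 16, 4, 10
    A = torch.nn.Linear(hidden, rank, bias=False)  # stand-in for Plora_A
    B = torch.nn.Linear(rank, hidden, bias=False)  # stand-in for Plora_B

    x = torch.randn(1, seq, hidden)
    base = torch.randn(1, seq, hidden)
    im_mask = torch.tensor([[0, 0, 1, 1, 1, 0, 0, 1, 1, 0]]).bool()
    ranges = [(2, 5), (7, 9)]  # same image-token runs as im_mask

    with torch.no_grad():
        old = lora_old(x, base.clone(), im_mask, 2.0, A, B)
        new = lora_new(x, base.clone(), ranges, 2.0, A, B)
    print(torch.allclose(old, new, atol=1e-6))  # expected: True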