diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index bd737917cb919..3748eb3544dd1 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -16,13 +16,6 @@ PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') -def silu_and_mul(output, input): - d = input.shape[-1] // 2 - silu = torch.nn.SiLU().to(input.device) - x, y = torch.split(input, d, dim=-1) - output.copy_(silu(x) * y) - - def fetch_from_cache(cache, blocks, permutations): return [ cache.index_select(0, blocks[:, i]).permute(permutations) @@ -81,12 +74,9 @@ def paged_attention_v1(query, return attn_weights.squeeze(-2) -def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: +def silu_and_mul(x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - silu_and_mul(out, x) - return out + return F.silu(x[..., :d]) * x[..., d:] def static_fused_moe(hidden_states, w1, w2, score, topk): @@ -111,13 +101,10 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): htorch.core.mark_step() for expert_idx in range(num_experts): - padded_weight = padded_weights[expert_idx] - current_state_static = hidden_states.reshape(-1, D) - w_output = silu_and_mul_wrapper( - torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) + w_output = torch.matmul(hidden_states, w1[expert_idx].transpose(0, 1)) + w_output = silu_and_mul(w_output) w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) - current_hidden_states_static = w_output * padded_weight - final_hidden_states += current_hidden_states_static + final_hidden_states += w_output * padded_weights[expert_idx] htorch.core.mark_step() return final_hidden_states.view(-1, D)