diff --git a/dolomite_engine/hf_models/models/ladder_residual_TP/layer.py b/dolomite_engine/hf_models/models/ladder_residual_TP/layer.py index 868cf45f..f90e9273 100644 --- a/dolomite_engine/hf_models/models/ladder_residual_TP/layer.py +++ b/dolomite_engine/hf_models/models/ladder_residual_TP/layer.py @@ -58,9 +58,6 @@ def forward( max_seqlen=max_seqlen, ) - if self.m_residual is not None: - current_attention_out = current_attention_out * self.m_residual - if current_mlp_out is not None: residual = residual + current_mlp_out @@ -68,7 +65,4 @@ def forward( current_mlp_out = rmsnorm_cute_forward(residual, ln_2_weight, self.ln_2.eps, self.sequence_parallel) current_mlp_out = self.mlp_block(current_mlp_out) - if self.m_residual is not None: - current_mlp_out = current_mlp_out * self.m_residual - return current_attention_out, current_mlp_out, residual