PaddlePaddle · phlrain · Jan 26, 2025
diff --git a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h
@@ -1264,7 +1264,9 @@ Tensor elu_decomp(const Tensor& x, const float alpha) {
     zero = full<T>(x_cast.shape(), 0, x_cast.type(), x_cast.place());
     tmp_res = alpha * (exp<T>(x_cast) - 1);
   }
-  auto ans = where<T>(x_cast > zero, x_cast, tmp_res);
+  // Select with where<T> instead of blending cond * x + (1 - cond) * tmp_res:
+  // exp(x_cast) overflows to inf for large positive x, and 0 * inf is NaN.
+  auto ans = where<T>(x_cast > zero, x_cast, tmp_res);
   return ConvertToOrig<T>(ans, x.dtype());
 }
 

diff --git a/paddle/fluid/primitive/decomp_rule/decomp_vjp/details.h b/paddle/fluid/primitive/decomp_rule/decomp_vjp/details.h
@@ -1857,7 +1857,6 @@ void hardswish_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
     t2 = cast<T>(t2, x.dtype());
 
     auto res = out_grad * (t1 * t2 * (x / offset + factor) + one - t2);
-    // auto res = out_grad * (t1 * t2 * (x / offset + factor) );
     set_output<T>(res, x_grad);
   }
 }
@@ -1870,7 +1869,10 @@ void leaky_relu_grad(const Tensor& out,
   if (x_grad) {
     auto zero = full_scalar<T>(0.0, out.dtype());
     auto condition = greater_than<T>(out, zero);
-    auto res = where<T>(condition, out_grad, out_grad * negative_slope);
+    auto cond_cast = cast<T>(condition, out_grad.dtype());  // 1 if out > 0
+    auto res = cond_cast * out_grad +
+               (1 - cond_cast) * out_grad *
+                   full_scalar<T>(negative_slope, out_grad.dtype());
     set_output<T>(res, x_grad);
   }
 }