From d5450b47643a45750c67608208d1e0ea40c7b7ec Mon Sep 17 00:00:00 2001
From: Mahmoud Abuzaina
Date: Fri, 10 Jan 2025 01:26:36 -0800
Subject: [PATCH] PR #19067: [XLA:CPU][oneDNN] Move simplification pass before
 oneDNN pass

Imported from GitHub PR https://github.com/openxla/xla/pull/19067

This PR moves the simplification pass before the oneDNN rewriter pass, which
simplifies the pattern matching for quantization support by getting rid of
redundant copy ops.

Copybara import of the project:

--
57f2f3b3e5a850ff264450af5a8bc796062cc8c6 by Mahmoud Abuzaina:

Move simplification pass before oneDNN pass

--
5248e332594414e71533154a63ea03145f533e4a by Mahmoud Abuzaina:

Added a unit test

Merging this change closes #19067

COPYBARA_INTEGRATE_REVIEW=https://github.com/openxla/xla/pull/19067 from Intel-tensorflow:mabuzain/reorder-passes 5248e332594414e71533154a63ea03145f533e4a
PiperOrigin-RevId: 713956033
---
 xla/service/cpu/cpu_compiler.cc               | 48 +++++++++----------
 .../cpu/tests/onednn_convolution_test.cc      | 31 ++++++++++--
 2 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/xla/service/cpu/cpu_compiler.cc b/xla/service/cpu/cpu_compiler.cc
index 5c28de6021def..f7546234d447f 100644
--- a/xla/service/cpu/cpu_compiler.cc
+++ b/xla/service/cpu/cpu_compiler.cc
@@ -791,6 +791,30 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn(
   pipeline.AddPass<ReshapeDecomposer>();
 
+  // The LayoutAssignment pass may leave behind kCopy instructions which are
+  // duplicate or NOPs, so remove them with algebraic simplification and CSE.
+  // Run this to a fixed point.
+  [&pipeline = pipeline.AddPass<HloPassFix<HloPassPipeline>>(
+       "simplification after layout assignment"),
+   this] {
+    AddHloVerifier(
+        &pipeline,
+        HloVerifierOpts{}.MakeLayoutSensitive().WithInstructionCanChangeLayout(
+            LayoutAssignment::InstructionCanChangeLayout),
+        /*debug_only=*/true);
+    AlgebraicSimplifierOptions options;
+    options.set_is_layout_sensitive(true);
+    options.set_supports_non_canonical_dots(false);
+    options.set_enable_dot_strength_reduction(false);
+    // TODO(b/209827141): XLA:CPU doesn't propagate NaN through min/max, but
+    // other platforms do, so it should be changed.
+    options.set_minmax_propagate_nan(false);
+    options.set_executing_on_cpu(true);
+    pipeline.AddPass<AlgebraicSimplifier>(options);
+    pipeline.AddPass<HloDCE>();
+    pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
+  }();
+
   const int max_parallelism =
       module->config().intra_op_parallelism_threads() > 0
           ? module->config().intra_op_parallelism_threads()
           : tsl::port::NumSchedulableCPUs();
@@ -822,30 +846,6 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn(
   // Add a fusion pass now that layout assignment is done.
   pipeline.AddPass<CpuInstructionFusion>();
 
-  // The LayoutAssignment pass may leave behind kCopy instructions which are
-  // duplicate or NOPs, so remove them with algebraic simplification and CSE.
-  // Run this to a fixed point.
-  [&pipeline = pipeline.AddPass<HloPassFix<HloPassPipeline>>(
-       "simplification after layout assignment"),
-   this] {
-    AddHloVerifier(
-        &pipeline,
-        HloVerifierOpts{}.MakeLayoutSensitive().WithInstructionCanChangeLayout(
-            LayoutAssignment::InstructionCanChangeLayout),
-        /*debug_only=*/true);
-    AlgebraicSimplifierOptions options;
-    options.set_is_layout_sensitive(true);
-    options.set_supports_non_canonical_dots(false);
-    options.set_enable_dot_strength_reduction(false);
-    // TODO(b/209827141): XLA:CPU doesn't propagate NaN through min/max, but
-    // other platforms do, so it should be changed.
-    options.set_minmax_propagate_nan(false);
-    options.set_executing_on_cpu(true);
-    pipeline.AddPass<AlgebraicSimplifier>(options);
-    pipeline.AddPass<HloDCE>();
-    pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
-  }();
-
   // Outline ops in the entry computation into calls to subcomputations.
   if (!is_aot_compile) {
     // Run ParallelTaskAssigner to assign parallel tasks to HLOs in module.
diff --git a/xla/service/cpu/tests/onednn_convolution_test.cc b/xla/service/cpu/tests/onednn_convolution_test.cc
index c94ada9dda190..4c011af8eabcb 100644
--- a/xla/service/cpu/tests/onednn_convolution_test.cc
+++ b/xla/service/cpu/tests/onednn_convolution_test.cc
@@ -141,13 +141,18 @@ class ConvolutionTest : public HloTestBase,
 
   void RunCompareAndMatchOptimizedHlo(
       const absl::string_view outline,
-      const std::vector<absl::string_view> fused_ops) {
+      const std::vector<absl::string_view> fused_ops,
+      const absl::string_view custom_match = "") {
     const std::string convolution_module_str = absl::StrReplaceAll(
        outline, {{"$dtype", dtypeString_},
                  {"$pdtype", PromotedDtypeToString()}});
     EXPECT_TRUE(RunAndCompare(convolution_module_str, ErrorSpec{atol_, rtol_}));
-    MatchOptimizedHlo(convolution_module_str,
-                      ConvStringWithOptimizations(fused_ops));
+    if (custom_match.empty()) {
+      MatchOptimizedHlo(convolution_module_str,
+                        ConvStringWithOptimizations(fused_ops));
+    } else {
+      MatchOptimizedHlo(convolution_module_str, custom_match);
+    }
   }
 };
 
@@ -593,6 +598,26 @@ TEST_P(ConvolutionTest, Conv2DWithBiasAndGeluExactPattern2Test) {
   RunCompareAndMatchOptimizedHlo(outline, {"BIAS", "GELU_ERF"});
 }
 
+TEST_P(ConvolutionTest, TransposeSimplifiedToBitcast) {
+  const char* outline = R"(
+  HloModule convolution.test.with.transpose
+
+  ENTRY convolution.test.with.transpose {
+    param_inp = $dtype[1,3,224,224] parameter(0)
+    transpose = $dtype[1,224,224,3] transpose(param_inp), dimensions={0,2,3,1}
+    param_wei = $dtype[64,3,7,7] parameter(1)
+    transpose.1 = $dtype[7,7,3,64] transpose(param_wei), dimensions={2,3,1,0}
+    ROOT convolution = $dtype[1,112,112,64] convolution(transpose, transpose.1),
+        window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f
+  })";
+
+  constexpr static const char* kBitcastCopyStr = R"(
+  ; CHECK: bitcast
+  ; CHECK: copy
+  ; CHECK: custom_call_target="__onednn$convolution")";
+  RunCompareAndMatchOptimizedHlo(outline, {}, kBitcastCopyStr);
+}
+
 INSTANTIATE_TEST_SUITE_P(
     OneDnnConvolutionTestSuite, ConvolutionTest,
     ::testing::Values(F32, BF16, F16),
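
--
Note (added context, not part of the original patch): the new test checks that,
with the simplification pipeline now running ahead of the oneDNN rewriter, the
input transpose is already folded into a layout change by the time pattern
matching happens. A rough before/after sketch for the F32 instantiation of the
test; the layout annotation is an illustrative assumption, not captured
compiler output:

  Before simplification (as written in the test module):
    transpose = f32[1,224,224,3] transpose(param_inp), dimensions={0,2,3,1}

  After "simplification after layout assignment" runs, roughly:
    bitcast = f32[1,224,224,3]{...} bitcast(param_inp)

The optimized HLO then contains a bitcast and a copy feeding the
custom_call_target="__onednn$convolution" custom call, which is exactly the
sequence the kBitcastCopyStr CHECK lines assert.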