PR #19067: [XLA:CPU][oneDNN] Move simplification pass before oneDNN pass
Imported from GitHub PR #19067

This PR moves the simplification pass before the oneDNN rewriter pass, which simplifies pattern matching for quantization support by getting rid of redundant copy ops.
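For context, the "redundant copy ops" are the layout-related copies and transposes that LayoutAssignment leaves behind. With the simplification pipeline running first, a layout-changing transpose like the one in the new unit test (shapes borrowed from that test; the concrete layout below is illustrative, since LayoutAssignment chooses layouts):

  transpose = f32[1,224,224,3] transpose(param_inp), dimensions={0,2,3,1}

can fold into a plain bitcast before the oneDNN rewriter ever runs:

  bitcast = f32[1,224,224,3]{3,2,1,0} bitcast(param_inp)

so the rewriter's pattern matcher sees the convolution operands directly instead of having to look through copy/transpose chains.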
Copybara import of the project:

--
57f2f3b by Mahmoud Abuzaina <[email protected]>:

Move simplification pass before oneDNN pass

--
5248e33 by Mahmoud Abuzaina <[email protected]>:

Added a unit test

Merging this change closes #19067

COPYBARA_INTEGRATE_REVIEW=#19067 from Intel-tensorflow:mabuzain/reorder-passes 5248e33
PiperOrigin-RevId: 713956033
mahmoud-abuzaina authored and Google-ML-Automation committed Jan 10, 2025
1 parent e6a6acf commit d5450b4
Showing 2 changed files with 52 additions and 27 deletions.
48 changes: 24 additions & 24 deletions xla/service/cpu/cpu_compiler.cc
@@ -791,6 +791,30 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn(
 
   pipeline.AddPass<ReshapeDecomposer>();
 
+  // The LayoutAssignment pass may leave behind kCopy instructions which are
+  // duplicate or NOPs, so remove them with algebraic simplification and CSE.
+  // Run this to a fixed point.
+  [&pipeline = pipeline.AddPass<HloPassFix<HloPassPipeline>>(
+       "simplification after layout assignment"),
+   this] {
+    AddHloVerifier(
+        &pipeline,
+        HloVerifierOpts{}.MakeLayoutSensitive().WithInstructionCanChangeLayout(
+            LayoutAssignment::InstructionCanChangeLayout),
+        /*debug_only=*/true);
+    AlgebraicSimplifierOptions options;
+    options.set_is_layout_sensitive(true);
+    options.set_supports_non_canonical_dots(false);
+    options.set_enable_dot_strength_reduction(false);
+    // TODO(b/209827141): XLA:CPU doesn't propagate NaN through min/max, but
+    // other platforms do, so it should be changed.
+    options.set_minmax_propagate_nan(false);
+    options.set_executing_on_cpu(true);
+    pipeline.AddPass<AlgebraicSimplifier>(options);
+    pipeline.AddPass<HloDCE>();
+    pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
+  }();
+
   const int max_parallelism =
       module->config().intra_op_parallelism_threads() > 0
           ? module->config().intra_op_parallelism_threads()
@@ -822,30 +846,6 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn(
   // Add a fusion pass now that layout assignment is done.
   pipeline.AddPass<CpuInstructionFusion>();
 
-  // The LayoutAssignment pass may leave behind kCopy instructions which are
-  // duplicate or NOPs, so remove them with algebraic simplification and CSE.
-  // Run this to a fixed point.
-  [&pipeline = pipeline.AddPass<HloPassFix<HloPassPipeline>>(
-       "simplification after layout assignment"),
-   this] {
-    AddHloVerifier(
-        &pipeline,
-        HloVerifierOpts{}.MakeLayoutSensitive().WithInstructionCanChangeLayout(
-            LayoutAssignment::InstructionCanChangeLayout),
-        /*debug_only=*/true);
-    AlgebraicSimplifierOptions options;
-    options.set_is_layout_sensitive(true);
-    options.set_supports_non_canonical_dots(false);
-    options.set_enable_dot_strength_reduction(false);
-    // TODO(b/209827141): XLA:CPU doesn't propagate NaN through min/max, but
-    // other platforms do, so it should be changed.
-    options.set_minmax_propagate_nan(false);
-    options.set_executing_on_cpu(true);
-    pipeline.AddPass<AlgebraicSimplifier>(options);
-    pipeline.AddPass<HloDCE>();
-    pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
-  }();
-
   // Outline ops in the entry computation into calls to subcomputations.
   if (!is_aot_compile) {
     // Run ParallelTaskAssigner to assign parallel tasks to HLOs in module.
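As the moved comment says, LayoutAssignment can leave behind kCopy instructions that are NOPs or duplicates. A hypothetical HLO fragment (not from this change) showing both kinds that the fixed-point pipeline removes:

  p0 = f32[16,32]{1,0} parameter(0)
  nop = f32[16,32]{1,0} copy(p0)
  copy.0 = f32[16,32]{0,1} copy(p0)
  copy.1 = f32[16,32]{0,1} copy(p0)

Here nop has the same shape and layout as p0, so the layout-sensitive AlgebraicSimplifier replaces it with p0, while copy.0 and copy.1 do change the layout but are identical, so layout-sensitive HloCSE merges them. Running this cleanup before the oneDNN rewriter, rather than after fusion as previously, is the point of the reordering above.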
31 changes: 28 additions & 3 deletions xla/service/cpu/tests/onednn_convolution_test.cc
@@ -141,13 +141,18 @@ class ConvolutionTest : public HloTestBase,
 
   void RunCompareAndMatchOptimizedHlo(
       const absl::string_view outline,
-      const std::vector<absl::string_view> fused_ops) {
+      const std::vector<absl::string_view> fused_ops,
+      const absl::string_view custom_match = "") {
     const std::string convolution_module_str = absl::StrReplaceAll(
         outline,
         {{"$dtype", dtypeString_}, {"$pdtype", PromotedDtypeToString()}});
     EXPECT_TRUE(RunAndCompare(convolution_module_str, ErrorSpec{atol_, rtol_}));
-    MatchOptimizedHlo(convolution_module_str,
-                      ConvStringWithOptimizations(fused_ops));
+    if (custom_match.empty()) {
+      MatchOptimizedHlo(convolution_module_str,
+                        ConvStringWithOptimizations(fused_ops));
+    } else {
+      MatchOptimizedHlo(convolution_module_str, custom_match);
+    }
   }
 };
 
@@ -593,6 +598,26 @@ TEST_P(ConvolutionTest, Conv2DWithBiasAndGeluExactPattern2Test) {
   RunCompareAndMatchOptimizedHlo(outline, {"BIAS", "GELU_ERF"});
 }
 
+TEST_P(ConvolutionTest, TransposeSimplifiedToBitcast) {
+  const char* outline = R"(
+  HloModule convolution.test.with.transpose
+  ENTRY convolution.test.with.transpose {
+    param_inp = $dtype[1,3,224,224] parameter(0)
+    transpose = $dtype[1,224,224,3] transpose(param_inp), dimensions={0,2,3,1}
+    param_wei = $dtype[64,3,7,7] parameter(1)
+    transpose.1 = $dtype[7,7,3,64] transpose(param_wei), dimensions={2,3,1,0}
+    ROOT convolution = $dtype[1,112,112,64] convolution(transpose, transpose.1),
+        window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f
+  })";
+
+  constexpr static const char* kBitcastCopyStr = R"(
+  ; CHECK: bitcast
+  ; CHECK: copy
+  ; CHECK: custom_call_target="__onednn$convolution")";
+  RunCompareAndMatchOptimizedHlo(outline, {}, kBitcastCopyStr);
+}
+
 INSTANTIATE_TEST_SUITE_P(
     OneDnnConvolutionTestSuite, ConvolutionTest,
     ::testing::Values(F32, BF16, F16),

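For orientation, the three CHECK lines in the new test accept optimized HLO of roughly the following shape. This is a hypothetical sketch only; the real output's instruction names, layouts, and which operand retains a copy are determined by the compiler, not by the pattern (shown for the F32 instantiation):

  bitcast.0 = f32[1,224,224,3]{3,2,1,0} bitcast(param_inp)
  copy.0 = f32[7,7,3,64]{3,2,1,0} copy(...)
  conv = f32[1,112,112,64]{3,2,1,0} custom-call(bitcast.0, copy.0), custom_call_target="__onednn$convolution"

That is, at least one input transpose now folds to a bitcast, which is the behavior this reordering enables, while the convolution itself is rewritten into a oneDNN custom call.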