Add fuzzer for RowNumber operator (#9524)

Summary: The RowNumberFuzzer is a testing tool that automatically generates equivalent query plans and then executes these plans to validate the consistency of the results. It works as follows: 1. Data Generation: It starts by generating a random set of input data, also known as a vector. This data can have a variety of encodings and data layouts to ensure thorough testing. 2. Plan Generation: Generate two equivalent query plans, one is row-number over ValuesNode and the other is over TableScanNode. 3. Query Execution: Executes those equivalent query plans using the generated data and asserts that the results are consistent across different plans. 4. Iteration: This process is repeated multiple times to ensure reliability and robustness. Pull Request resolved: #9524 Reviewed By: kevinwilfong Differential Revision: D56767074 Pulled By: xiaoxmeng fbshipit-source-id: 644b9a341e5273cb37efbdf0bd8254edcc54c974
facebookincubator · May 3, 2024 · 38abde9 · 38abde9
1 parent f54787b
commit 38abde9
Show file tree

Hide file tree

Showing 12 changed files with 891 additions and 7 deletions.
diff --git a/velox/docs/develop/testing/join-fuzzer.rst b/velox/docs/develop/testing/join-fuzzer.rst
@@ -42,7 +42,7 @@ Use velox_join_fuzzer_test binary to run join fuzzer:
 
     velox/exec/tests/velox_join_fuzzer_test
 
-By default, the fuzzer will go through 10 interations. Use --steps
+By default, the fuzzer will go through 10 iterations. Use --steps
 or --duration-sec flag to run fuzzer for longer. Use --seed to
 reproduce fuzzer failures.
 

diff --git a/velox/docs/develop/testing/row-number-fuzzer.rst b/velox/docs/develop/testing/row-number-fuzzer.rst
@@ -0,0 +1,55 @@
+================
+RowNumber Fuzzer
+================
+
+The RowNumberFuzzer is a testing tool that automatically generates equivalent query plans and then executes these plans
+to validate the consistency of the results. It works as follows:
+
+1. Data Generation: It starts by generating a random set of input data, also known as a vector. This data can
+   have a variety of encodings and data layouts to ensure thorough testing.
+2. Plan Generation: Generate two equivalent query plans, one is row-number over ValuesNode as the base plan,
+   and the other is over TableScanNode as the alter plan.
+3. Query Execution: Executes those equivalent query plans using the generated data and asserts that the results are
+   consistent across different plans.
+
+   i. Execute the base plan, compare the result with the reference (DuckDB or Presto) and use it as the expected result.
+   #. Execute the alter plan multiple times with and without spill, and compare each result with the
+      expected result.
+
+4. Iteration: This process is repeated multiple times to ensure reliability and robustness.
+
+How to run
+----------
+
+Use the velox_row_number_fuzzer_test binary to run the RowNumber fuzzer:
+
+::
+
+    velox/exec/tests/velox_row_number_fuzzer_test --seed 123 --duration_sec 60
+
+By default, the fuzzer will go through 10 iterations. Use --steps
+or --duration_sec flag to run fuzzer for longer. Use --seed to
+reproduce fuzzer failures.
+
+Here is a full list of supported command line arguments.
+
+* ``--steps``: How many iterations to run. Each iteration generates one pair of
+  equivalent query plans and verifies their results. Default is 10.
+
+* ``--duration_sec``: For how long to run in seconds. If both ``--steps``
+  and ``--duration_sec`` are specified, ``--duration_sec`` takes precedence.
+
+* ``--seed``: The seed to generate random expressions and input vectors with.
+
+* ``--v=1``: Verbose logging (from `Google Logging Library <https://github.com/google/glog#setting-flags>`_).
+
+* ``--batch_size``: The size of input vectors to generate. Default is 100.
+
+* ``--num_batches``: The number of input vectors of size ``--batch_size`` to
+  generate. Default is 5.
+
+* ``--enable_spill``: Whether to test with spilling or not. Default is true.
+
+* ``--presto_url``: The PrestoQueryRunner URL along with its port number.
+
+* ``--req_timeout_ms``: Timeout in milliseconds of an HTTP request to the PrestoQueryRunner.
+
+If running from CLion IDE, add ``--logtostderr=1`` to see the full output.
diff --git a/velox/exec/fuzzer/CMakeLists.txt b/velox/exec/fuzzer/CMakeLists.txt
@@ -57,3 +57,9 @@ target_link_libraries(
   velox_expression_test_utility
   velox_aggregation_fuzzer_base
   velox_temp_path)
+
+add_library(velox_row_number_fuzzer RowNumberFuzzer.cpp)
+
+target_link_libraries(
+  velox_row_number_fuzzer velox_fuzzer_util velox_type velox_vector_fuzzer
+  velox_exec_test_lib velox_expression_test_utility)
diff --git a/velox/exec/fuzzer/DuckQueryRunner.cpp b/velox/exec/fuzzer/DuckQueryRunner.cpp
@@ -133,21 +133,26 @@ std::optional<std::string> DuckQueryRunner::toSql(
     }
   }
 
-  if (auto projectNode =
+  if (const auto projectNode =
           std::dynamic_pointer_cast<const core::ProjectNode>(plan)) {
     return toSql(projectNode);
   }
 
-  if (auto windowNode =
+  if (const auto windowNode =
           std::dynamic_pointer_cast<const core::WindowNode>(plan)) {
     return toSql(windowNode);
   }
 
-  if (auto aggregationNode =
+  if (const auto aggregationNode =
           std::dynamic_pointer_cast<const core::AggregationNode>(plan)) {
     return toSql(aggregationNode);
   }
 
+  if (const auto rowNumberNode =
+          std::dynamic_pointer_cast<const core::RowNumberNode>(plan)) {
+    return toSql(rowNumberNode);
+  }
+
   VELOX_NYI();
 }
 
@@ -297,4 +302,31 @@ std::optional<std::string> DuckQueryRunner::toSql(
 
   return sql.str();
 }
+
+// Translates a RowNumberNode into a SQL query of the form
+//   SELECT <input columns>, row_number() OVER ([partition by <keys>])
+//     as row_number FROM tmp
+// The column list is taken from the output type of the node's first source.
+// This overload has no std::nullopt return path; it always returns the
+// generated query text.
+std::optional<std::string> DuckQueryRunner::toSql(
+    const std::shared_ptr<const core::RowNumberNode>& rowNumberNode) {
+  std::stringstream sql;
+  sql << "SELECT ";
+
+  // Project every column of the input row type by name.
+  const auto& inputType = rowNumberNode->sources()[0]->outputType();
+  for (auto i = 0; i < inputType->size(); ++i) {
+    // NOTE(review): appendComma is defined outside this hunk; presumably it
+    // writes a separator before every item except the first - confirm.
+    appendComma(i, sql);
+    sql << inputType->nameOf(i);
+  }
+
+  sql << ", row_number() OVER (";
+
+  // The OVER clause stays empty when the node has no partition keys.
+  const auto& partitionKeys = rowNumberNode->partitionKeys();
+  if (!partitionKeys.empty()) {
+    sql << "partition by ";
+    for (auto i = 0; i < partitionKeys.size(); ++i) {
+      appendComma(i, sql);
+      sql << partitionKeys[i]->name();
+    }
+  }
+
+  // The query reads from a table named 'tmp'.
+  // NOTE(review): presumably the caller registers the input data under that
+  // name before running the query - confirm.
+  sql << ") as row_number FROM tmp";
+
+  return sql.str();
+}
 } // namespace facebook::velox::exec::test
diff --git a/velox/exec/fuzzer/DuckQueryRunner.h b/velox/exec/fuzzer/DuckQueryRunner.h
@@ -49,6 +49,9 @@ class DuckQueryRunner : public ReferenceQueryRunner {
   std::optional<std::string> toSql(
       const std::shared_ptr<const core::ProjectNode>& projectNode);
 
+  std::optional<std::string> toSql(
+      const std::shared_ptr<const core::RowNumberNode>& rowNumberNode);
+
   std::unordered_set<std::string> aggregateFunctionNames_;
 };
 

diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp
@@ -159,21 +159,26 @@ PrestoQueryRunner::PrestoQueryRunner(
 
 std::optional<std::string> PrestoQueryRunner::toSql(
     const core::PlanNodePtr& plan) {
-  if (auto projectNode =
+  if (const auto projectNode =
           std::dynamic_pointer_cast<const core::ProjectNode>(plan)) {
     return toSql(projectNode);
   }
 
-  if (auto windowNode =
+  if (const auto windowNode =
           std::dynamic_pointer_cast<const core::WindowNode>(plan)) {
     return toSql(windowNode);
   }
 
-  if (auto aggregationNode =
+  if (const auto aggregationNode =
           std::dynamic_pointer_cast<const core::AggregationNode>(plan)) {
     return toSql(aggregationNode);
   }
 
+  if (const auto rowNumberNode =
+          std::dynamic_pointer_cast<const core::RowNumberNode>(plan)) {
+    return toSql(rowNumberNode);
+  }
+
   VELOX_NYI();
 }
 
@@ -500,6 +505,37 @@ std::optional<std::string> PrestoQueryRunner::toSql(
   return sql.str();
 }
 
+// Translates a RowNumberNode into a SQL query of the form
+//   SELECT <input columns>, row_number() OVER ([partition by <keys>])
+//     as row_number FROM tmp
+// The column list is taken from the output type of the node's first source.
+// Returns std::nullopt when isSupportedDwrfType rejects that type; otherwise
+// returns the generated query text.
+std::optional<std::string> PrestoQueryRunner::toSql(
+    const std::shared_ptr<const core::RowNumberNode>& rowNumberNode) {
+  // No query is produced for input types that fail the DWRF support check.
+  // NOTE(review): presumably because the input is handed to Presto as a DWRF
+  // file - confirm against isSupportedDwrfType and execute().
+  if (!isSupportedDwrfType(rowNumberNode->sources()[0]->outputType())) {
+    return std::nullopt;
+  }
+
+  std::stringstream sql;
+  sql << "SELECT ";
+
+  // Project every column of the input row type by name.
+  const auto& inputType = rowNumberNode->sources()[0]->outputType();
+  for (auto i = 0; i < inputType->size(); ++i) {
+    // NOTE(review): appendComma is defined outside this hunk; presumably it
+    // writes a separator before every item except the first - confirm.
+    appendComma(i, sql);
+    sql << inputType->nameOf(i);
+  }
+
+  sql << ", row_number() OVER (";
+
+  // The OVER clause stays empty when the node has no partition keys.
+  const auto& partitionKeys = rowNumberNode->partitionKeys();
+  if (!partitionKeys.empty()) {
+    sql << "partition by ";
+    for (auto i = 0; i < partitionKeys.size(); ++i) {
+      appendComma(i, sql);
+      sql << partitionKeys[i]->name();
+    }
+  }
+
+  // The query reads from a table named 'tmp'.
+  sql << ") as row_number FROM tmp";
+
+  return sql.str();
+}
+
 std::multiset<std::vector<variant>> PrestoQueryRunner::execute(
     const std::string& sql,
     const std::vector<RowVectorPtr>& input,

diff --git a/velox/exec/fuzzer/PrestoQueryRunner.h b/velox/exec/fuzzer/PrestoQueryRunner.h
@@ -86,6 +86,9 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner {
   std::optional<std::string> toSql(
       const std::shared_ptr<const velox::core::ProjectNode>& projectNode);
 
+  std::optional<std::string> toSql(
+      const std::shared_ptr<const velox::core::RowNumberNode>& rowNumberNode);
+
   std::string startQuery(const std::string& sql);
 
   std::string fetchNext(const std::string& nextUri);