Move Aggregation Function Data Specs into the Query Runners (#11192)

Summary: Pull Request resolved: #11192 Aggregation Function Data Specs declare when we don't want to generate NaN/Infinity for the arguments to a particular aggregate. Today these are hard coded in the AggregationFuzzer itself, but whether or not we support NaN/Infinity in fuzzing an aggregate function is more tightly tied to what we are comparing against (in what cases it's compatible with Velox) which Fuzzer we're running (as evidenced by the fact the Presto and Spark Aggregation Fuzzers are using the same lists currently). We are seeing cases where fuzzing with the Presto Query Runner is succeeding while fuzzing with the DuckDB Query Runner is failing. To fix this I've moved the Aggregation Function Data Specs map into the Query Runner, so that we can specify per Query Runner in what aggregation functions we don't want to produce Nan/Infinity for. Reviewed By: kewang1024 Differential Revision: D64011761 fbshipit-source-id: 5d516f61cebf1baedb6ef5ffb8a6821a2f2c4782
facebookincubator · Oct 10, 2024 · 5e4b35a · 5e4b35a
1 parent 7356542
commit 5e4b35a
Show file tree

Hide file tree

Showing 13 changed files with 75 additions and 61 deletions.
diff --git a/velox/exec/fuzzer/AggregationFuzzerOptions.h b/velox/exec/fuzzer/AggregationFuzzerOptions.h
@@ -50,8 +50,6 @@ struct AggregationFuzzerOptions {
 
   std::unordered_set<std::string> orderDependentFunctions;
 
-  std::unordered_map<std::string, DataSpec> functionDataSpec;
-
   /// Timestamp precision to use when generating inputs of type TIMESTAMP.
   VectorFuzzer::Options::TimestampPrecision timestampPrecision{
       VectorFuzzer::Options::TimestampPrecision::kMilliSeconds};

diff --git a/velox/exec/fuzzer/AggregationFuzzerRunner.h b/velox/exec/fuzzer/AggregationFuzzerRunner.h
@@ -107,12 +107,15 @@ class AggregationFuzzerRunner {
         registerVectorSerde();
     facebook::velox::filesystems::registerLocalFileSystem();
 
+    auto& aggregationFunctionDataSpecs =
+        referenceQueryRunner->aggregationFunctionDataSpecs();
+
     facebook::velox::exec::test::aggregateFuzzer(
         filteredSignatures,
         seed,
         options.customVerificationFunctions,
         options.customInputGenerators,
-        options.functionDataSpec,
+        aggregationFunctionDataSpecs,
         options.timestampPrecision,
         options.queryConfigs,
         options.hiveConfigs,

diff --git a/velox/exec/fuzzer/DuckQueryRunner.cpp b/velox/exec/fuzzer/DuckQueryRunner.cpp
@@ -79,6 +79,29 @@ const std::vector<TypePtr>& DuckQueryRunner::supportedScalarTypes() const {
   return kScalarTypes;
 }
 
+const std::unordered_map<std::string, DataSpec>&
+DuckQueryRunner::aggregationFunctionDataSpecs() const {
+  // There are some functions for which DuckDB and Velox have inconsistent
+  // behavior with Nan and Infinity, so we exclude those.
+  static const std::unordered_map<std::string, DataSpec>
+      kAggregationFunctionDataSpecs{
+          {"covar_pop", DataSpec{true, false}},
+          {"covar_samp", DataSpec{true, false}},
+          {"histogram", DataSpec{false, false}},
+          {"regr_avgx", DataSpec{true, false}},
+          {"regr_avgy", DataSpec{true, false}},
+          {"regr_intercept", DataSpec{false, false}},
+          {"regr_r2", DataSpec{false, false}},
+          {"regr_replacement", DataSpec{false, false}},
+          {"regr_slope", DataSpec{false, false}},
+          {"regr_sxx", DataSpec{false, false}},
+          {"regr_sxy", DataSpec{false, false}},
+          {"regr_syy", DataSpec{false, false}},
+          {"var_pop", DataSpec{false, false}}};
+
+  return kAggregationFunctionDataSpecs;
+}
+
 std::multiset<std::vector<velox::variant>> DuckQueryRunner::execute(
     const std::string& sql,
     const std::vector<RowVectorPtr>& input,

diff --git a/velox/exec/fuzzer/DuckQueryRunner.h b/velox/exec/fuzzer/DuckQueryRunner.h
@@ -34,6 +34,9 @@ class DuckQueryRunner : public ReferenceQueryRunner {
   /// TODO Investigate mismatches reported when comparing Varbinary.
   const std::vector<TypePtr>& supportedScalarTypes() const override;
 
+  const std::unordered_map<std::string, DataSpec>&
+  aggregationFunctionDataSpecs() const override;
+
   /// Specify names of aggregate function to exclude from the list of supported
   /// functions. Used to exclude functions that are non-determonistic, have bugs
   /// or whose semantics differ from Velox.

diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp
@@ -273,6 +273,29 @@ const std::vector<TypePtr>& PrestoQueryRunner::supportedScalarTypes() const {
   return kScalarTypes;
 }
 
+const std::unordered_map<std::string, DataSpec>&
+PrestoQueryRunner::aggregationFunctionDataSpecs() const {
+  // For some functions, velox supports NaN, Infinity better than presto query
+  // runner, which makes the comparison impossible.
+  // Add data constraint in vector fuzzer to enforce to not generate such data
+  // for those functions before they are fixed in presto query runner
+  static const std::unordered_map<std::string, DataSpec>
+      kAggregationFunctionDataSpecs{
+          {"regr_avgx", DataSpec{false, false}},
+          {"regr_avgy", DataSpec{false, false}},
+          {"regr_r2", DataSpec{false, false}},
+          {"regr_sxx", DataSpec{false, false}},
+          {"regr_syy", DataSpec{false, false}},
+          {"regr_sxy", DataSpec{false, false}},
+          {"regr_slope", DataSpec{false, false}},
+          {"regr_replacement", DataSpec{false, false}},
+          {"covar_pop", DataSpec{true, false}},
+          {"covar_samp", DataSpec{true, false}},
+      };
+
+  return kAggregationFunctionDataSpecs;
+}
+
 std::optional<std::string> PrestoQueryRunner::toSql(
     const std::shared_ptr<const core::AggregationNode>& aggregationNode) {
   // Assume plan is Aggregation over Values.

diff --git a/velox/exec/fuzzer/PrestoQueryRunner.h b/velox/exec/fuzzer/PrestoQueryRunner.h
@@ -53,6 +53,9 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner {
 
   const std::vector<TypePtr>& supportedScalarTypes() const override;
 
+  const std::unordered_map<std::string, DataSpec>&
+  aggregationFunctionDataSpecs() const override;
+
   /// Converts Velox query plan to Presto SQL. Supports Values -> Aggregation or
   /// Window with an optional Project on top.
   ///

diff --git a/velox/exec/fuzzer/ReferenceQueryRunner.h b/velox/exec/fuzzer/ReferenceQueryRunner.h
@@ -45,6 +45,9 @@ class ReferenceQueryRunner {
     return defaultScalarTypes();
   }
 
+  virtual const std::unordered_map<std::string, DataSpec>&
+  aggregationFunctionDataSpecs() const = 0;
+
   /// Converts Velox plan into SQL accepted by the reference database.
   /// @return std::nullopt if the plan uses features not supported by the
   /// reference database.

diff --git a/velox/exec/fuzzer/WindowFuzzerRunner.h b/velox/exec/fuzzer/WindowFuzzerRunner.h
@@ -79,14 +79,17 @@ class WindowFuzzerRunner {
         registerVectorSerde();
     facebook::velox::filesystems::registerLocalFileSystem();
 
+    auto& aggregationFunctionDataSpecs =
+        referenceQueryRunner->aggregationFunctionDataSpecs();
+
     facebook::velox::exec::test::windowFuzzer(
         filteredAggregationSignatures,
         filteredWindowSignatures,
         seed,
         options.customVerificationFunctions,
         options.customInputGenerators,
         options.orderDependentFunctions,
-        options.functionDataSpec,
+        aggregationFunctionDataSpecs,
         options.timestampPrecision,
         options.queryConfigs,
         options.hiveConfigs,

diff --git a/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp b/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp
@@ -184,24 +184,6 @@ int main(int argc, char** argv) {
           {"sum_data_size_for_stats", nullptr},
       };
 
-  using facebook::velox::DataSpec;
-  // For some functions, velox supports NaN, Infinity better than presto query
-  // runner, which makes the comparison impossible.
-  // Add data constraint in vector fuzzer to enforce to not generate such data
-  // for those functions before they are fixed in presto query runner
-  static const std::unordered_map<std::string, DataSpec> functionDataSpec = {
-      {"regr_avgx", DataSpec{false, false}},
-      {"regr_avgy", DataSpec{false, false}},
-      {"regr_r2", DataSpec{false, false}},
-      {"regr_sxx", DataSpec{false, false}},
-      {"regr_syy", DataSpec{false, false}},
-      {"regr_sxy", DataSpec{false, false}},
-      {"regr_slope", DataSpec{false, false}},
-      {"regr_replacement", DataSpec{false, false}},
-      {"covar_pop", DataSpec{true, false}},
-      {"covar_samp", DataSpec{true, false}},
-  };
-
   using Runner = facebook::velox::exec::test::AggregationFuzzerRunner;
   using Options = facebook::velox::exec::test::AggregationFuzzerOptions;
 
@@ -213,9 +195,9 @@ int main(int argc, char** argv) {
       facebook::velox::exec::test::getCustomInputGenerators();
   options.timestampPrecision =
       facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMilliSeconds;
-  options.functionDataSpec = functionDataSpec;
   std::shared_ptr<facebook::velox::memory::MemoryPool> rootPool{
       facebook::velox::memory::memoryManager()->addRootPool()};
+
   return Runner::run(
       initialSeed,
       setupReferenceQueryRunner(

diff --git a/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp b/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp
@@ -141,24 +141,6 @@ int main(int argc, char** argv) {
           {"sum_data_size_for_stats", nullptr},
       };
 
-  using facebook::velox::DataSpec;
-  // For some functions, velox supports NaN, Infinity better than presto query
-  // runner, which makes the comparison impossible.
-  // Add data spec in vector fuzzer to enforce to not generate such data
-  // for those functions before they are fixed in presto query runner
-  static const std::unordered_map<std::string, DataSpec> functionDataSpec = {
-      {"regr_avgx", DataSpec{false, false}},
-      {"regr_avgy", DataSpec{false, false}},
-      {"regr_r2", DataSpec{false, false}},
-      {"regr_sxx", DataSpec{false, false}},
-      {"regr_syy", DataSpec{false, false}},
-      {"regr_sxy", DataSpec{false, false}},
-      {"regr_slope", DataSpec{false, false}},
-      {"regr_replacement", DataSpec{false, false}},
-      {"covar_pop", DataSpec{true, false}},
-      {"covar_samp", DataSpec{true, false}},
-  };
-
   static const std::unordered_set<std::string> orderDependentFunctions = {
       // Window functions.
       "first_value",
@@ -199,7 +181,6 @@ int main(int argc, char** argv) {
   options.orderDependentFunctions = orderDependentFunctions;
   options.timestampPrecision =
       facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMilliSeconds;
-  options.functionDataSpec = functionDataSpec;
   std::shared_ptr<facebook::velox::memory::MemoryPool> rootPool{
       facebook::velox::memory::memoryManager()->addRootPool()};
   return Runner::run(

diff --git a/velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp b/velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp
@@ -111,24 +111,6 @@ int main(int argc, char** argv) {
        // formula. The results from the two methods are completely different.
        "kurtosis"});
 
-  using facebook::velox::DataSpec;
-  // For some functions, velox supports NaN, Infinity better than presto query
-  // runner, which makes the comparison impossible.
-  // Add data spec in vector fuzzer to enforce to not generate such data
-  // for those functions before they are fixed in presto query runner
-  static const std::unordered_map<std::string, DataSpec> functionDataSpec = {
-      {"regr_avgx", DataSpec{false, false}},
-      {"regr_avgy", DataSpec{false, false}},
-      {"regr_r2", DataSpec{false, false}},
-      {"regr_sxx", DataSpec{false, false}},
-      {"regr_syy", DataSpec{false, false}},
-      {"regr_sxy", DataSpec{false, false}},
-      {"regr_slope", DataSpec{false, false}},
-      {"regr_replacement", DataSpec{false, false}},
-      {"covar_pop", DataSpec{true, false}},
-      {"covar_samp", DataSpec{true, false}},
-  };
-
   using Runner = facebook::velox::exec::test::AggregationFuzzerRunner;
   using Options = facebook::velox::exec::test::AggregationFuzzerOptions;
 
@@ -137,6 +119,5 @@ int main(int argc, char** argv) {
   options.skipFunctions = skipFunctions;
   options.customVerificationFunctions = customVerificationFunctions;
   options.orderableGroupKeys = true;
-  options.functionDataSpec = functionDataSpec;
   return Runner::run(initialSeed, std::move(duckQueryRunner), options);
 }
diff --git a/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp b/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp
@@ -90,6 +90,14 @@ const std::vector<TypePtr>& SparkQueryRunner::supportedScalarTypes() const {
   return kScalarTypes;
 }
 
+const std::unordered_map<std::string, DataSpec>&
+SparkQueryRunner::aggregationFunctionDataSpecs() const {
+  static const std::unordered_map<std::string, DataSpec>
+      kAggregationFunctionDataSpecs{};
+
+  return kAggregationFunctionDataSpecs;
+}
+
 std::optional<std::string> SparkQueryRunner::toSql(
     const velox::core::PlanNodePtr& plan) {
   if (const auto aggregationNode =

diff --git a/velox/functions/sparksql/fuzzer/SparkQueryRunner.h b/velox/functions/sparksql/fuzzer/SparkQueryRunner.h
@@ -73,6 +73,9 @@ class SparkQueryRunner : public velox::exec::test::ReferenceQueryRunner {
 
   const std::vector<TypePtr>& supportedScalarTypes() const override;
 
+  const std::unordered_map<std::string, DataSpec>&
+  aggregationFunctionDataSpecs() const override;
+
   bool supportsVeloxVectorResults() const override {
     return true;
   }