From fe7d648fd0919c9caa308650be7e9ab4b285d838 Mon Sep 17 00:00:00 2001 From: Heidi Han Date: Sun, 5 Jan 2025 21:51:33 -0800 Subject: [PATCH] feat(fuzzer): Allow bucket columns to overlap as sort columns in writer fuzzer (#12007) Summary: Allow bucket columns to overlap as sort columns by using some of the bucket columns as sort columns and generating the rest of the sort columns. Differential Revision: D67775105 Pulled By: HeidiHan0000 --- velox/exec/fuzzer/WriterFuzzer.cpp | 63 +++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/velox/exec/fuzzer/WriterFuzzer.cpp b/velox/exec/fuzzer/WriterFuzzer.cpp index d3d60e49aaf2..ca4c4a55192c 100644 --- a/velox/exec/fuzzer/WriterFuzzer.cpp +++ b/velox/exec/fuzzer/WriterFuzzer.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include "velox/common/base/Fs.h" #include "velox/common/encode/Base64.h" @@ -114,7 +115,18 @@ class WriterFuzzer { const std::vector& dataTypes, int32_t maxDepth, std::vector& names, - std::vector& types); + std::vector& types, + int32_t offset = 0); + + // Generates at least one and up to maxNumColumns sort columns, + // a random number of which reuse existing bucket-by columns. + // Returns the names of the reused and newly generated sort columns + // and the number of sort columns that overlap with bucket columns. + std::tuple, int> generateSortColumns( + int32_t maxNumColumns, + std::vector& names, + std::vector& types, + const std::vector& bucketColumns); // Generates input data for table write. std::vector generateInputData( @@ -342,12 +354,12 @@ void WriterFuzzer::go() { bucketCount = boost::random::uniform_int_distribution(1, 3)(rng_); - // TODO: sort columns can overlap as bucket columns // 50% of times test ordered write. 
if (vectorFuzzer_.coinToss(0.5)) { sortColumnOffset = names.size(); - auto sortColumns = generateColumns( - 3, "s", kSupportedSortColumnTypes_, 1, names, types); + auto [sortColumns, offset] = + generateSortColumns(3, names, types, bucketColumns); + sortColumnOffset -= offset; sortBy.reserve(sortColumns.size()); for (const auto& sortByColumn : sortColumns) { sortBy.push_back(std::make_shared( @@ -392,11 +404,12 @@ std::vector WriterFuzzer::generateColumns( const std::vector& dataTypes, int32_t maxDepth, std::vector& names, - std::vector& types) { + std::vector& types, + const int32_t offset) { const auto numColumns = boost::random::uniform_int_distribution(1, maxNumColumns)(rng_); std::vector columns; - for (auto i = 0; i < numColumns; ++i) { + for (auto i = offset; i < numColumns; ++i) { columns.push_back(fmt::format("{}{}", prefix, i)); // Pick random, possibly complex, type. @@ -406,6 +419,44 @@ std::vector WriterFuzzer::generateColumns( return columns; } +std::tuple, int> WriterFuzzer::generateSortColumns( + int32_t maxNumColumns, + std::vector& names, + std::vector& types, + const std::vector& bucketColumns) { + // A random number of sort columns reuse the trailing bucket columns, which + // were already generated. + const auto maxOverlapColumns = std::min( + maxNumColumns, static_cast(bucketColumns.size())); + const auto numOverlapColumns = + static_cast(boost::random::uniform_int_distribution( + 0, maxOverlapColumns)(rng_)); + + auto overlapOffset = bucketColumns.size() - numOverlapColumns; + std::vector columns; + for (auto i = 0; i < numOverlapColumns; ++i) { + columns.push_back(bucketColumns.at(overlapOffset + i)); + } + + // The remaining sort columns, which do not overlap with bucket-by columns, + // are generated as new columns with the prefix "s". + if (auto remainingColumns = maxNumColumns - numOverlapColumns; + remainingColumns > 0) { + auto nonOverlapColumns = generateColumns( + remainingColumns, + "s", + kSupportedSortColumnTypes_, + 1, + names, + types, + 
numOverlapColumns); + columns.insert( + columns.end(), nonOverlapColumns.begin(), nonOverlapColumns.end()); + } + + return {columns, numOverlapColumns}; +} + std::vector WriterFuzzer::generateInputData( std::vector names, std::vector types,