Skip to content

Commit

Permalink
feat(fuzzer): Allow bucket columns to overlap as sort columns in writ…
Browse files Browse the repository at this point in the history
…er fuzzer (#12007)

Summary:
Allow sort columns to overlap with bucket columns: a random number of the already-generated bucket columns are reused as sort columns, and the rest of the sort columns are newly generated.


Differential Revision: D67775105

Pulled By: HeidiHan0000
  • Loading branch information
HeidiHan0000 authored and facebook-github-bot committed Jan 6, 2025
1 parent 6171b52 commit fe7d648
Showing 1 changed file with 57 additions and 6 deletions.
63 changes: 57 additions & 6 deletions velox/exec/fuzzer/WriterFuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <boost/random/uniform_int_distribution.hpp>

#include <re2/re2.h>
#include <algorithm>
#include <unordered_set>
#include "velox/common/base/Fs.h"
#include "velox/common/encode/Base64.h"
Expand Down Expand Up @@ -114,7 +115,18 @@ class WriterFuzzer {
const std::vector<TypePtr>& dataTypes,
int32_t maxDepth,
std::vector<std::string>& names,
std::vector<TypePtr>& types);
std::vector<TypePtr>& types,
int32_t offset = 0);

// Picks the sort columns for an ordered write: up to 'maxNumColumns' columns
// (intended to be at least one), a random number of which are reused from the
// tail of 'bucketColumns' so that sort columns can overlap with bucket
// columns. Any columns that are not reused are produced by generateColumns(),
// which receives 'names' and 'types' by reference.
// Returns the names of the sort columns (reused bucket columns first, then
// the newly generated ones) and the number of reused bucket columns, which
// the caller subtracts from its sort column offset.
std::tuple<std::vector<std::string>, int> generateSortColumns(
int32_t maxNumColumns,
std::vector<std::string>& names,
std::vector<TypePtr>& types,
const std::vector<std::string>& bucketColumns);

// Generates input data for table write.
std::vector<RowVectorPtr> generateInputData(
Expand Down Expand Up @@ -342,12 +354,12 @@ void WriterFuzzer::go() {
bucketCount =
boost::random::uniform_int_distribution<int32_t>(1, 3)(rng_);

// TODO: sort columns can overlap as bucket columns
// 50% of times test ordered write.
if (vectorFuzzer_.coinToss(0.5)) {
sortColumnOffset = names.size();
auto sortColumns = generateColumns(
3, "s", kSupportedSortColumnTypes_, 1, names, types);
auto [sortColumns, offset] =
generateSortColumns(3, names, types, bucketColumns);
sortColumnOffset -= offset;
sortBy.reserve(sortColumns.size());
for (const auto& sortByColumn : sortColumns) {
sortBy.push_back(std::make_shared<const HiveSortingColumn>(
Expand Down Expand Up @@ -392,11 +404,12 @@ std::vector<std::string> WriterFuzzer::generateColumns(
const std::vector<TypePtr>& dataTypes,
int32_t maxDepth,
std::vector<std::string>& names,
std::vector<TypePtr>& types) {
std::vector<TypePtr>& types,
const int32_t offset) {
const auto numColumns =
boost::random::uniform_int_distribution<uint32_t>(1, maxNumColumns)(rng_);
std::vector<std::string> columns;
for (auto i = 0; i < numColumns; ++i) {
for (auto i = offset; i < numColumns; ++i) {
columns.push_back(fmt::format("{}{}", prefix, i));

// Pick random, possibly complex, type.
Expand All @@ -406,6 +419,44 @@ std::vector<std::string> WriterFuzzer::generateColumns(
return columns;
}

// Picks the sort columns for an ordered write, allowing them to overlap with
// the bucket columns that were generated earlier.
//
// A random number of columns, between 0 and
// min(maxNumColumns, bucketColumns.size()), is reused from the tail of
// 'bucketColumns'. If that leaves room below 'maxNumColumns', generateColumns()
// is asked for new columns with prefix "s" drawn from
// kSupportedSortColumnTypes_; 'names' and 'types' are handed to it by
// reference.
//
// Returns the sort column names (reused bucket columns first, then the newly
// generated ones) together with the number of reused bucket columns.
std::tuple<std::vector<std::string>, int> WriterFuzzer::generateSortColumns(
    int32_t maxNumColumns,
    std::vector<std::string>& names,
    std::vector<TypePtr>& types,
    const std::vector<std::string>& bucketColumns) {
  // Build the result in place and return the named local, so the vector of
  // column names is moved/elided instead of being copied into a temporary
  // tuple by a braced return.
  std::tuple<std::vector<std::string>, int> result;
  auto& columns = std::get<0>(result);

  // A random number of sort columns will overlap with bucket columns, which
  // are already generated.
  const auto maxOverlapColumns = std::min<int32_t>(
      maxNumColumns, static_cast<int32_t>(bucketColumns.size()));
  const auto numOverlapColumns =
      static_cast<int32_t>(boost::random::uniform_int_distribution<uint32_t>(
          0, maxOverlapColumns)(rng_));

  // Reuse the last 'numOverlapColumns' bucket columns, preserving their order.
  const auto firstOverlapIndex = bucketColumns.size() - numOverlapColumns;
  for (int32_t i = 0; i < numOverlapColumns; ++i) {
    columns.push_back(bucketColumns.at(firstOverlapIndex + i));
  }

  // Remaining columns which do not overlap with bucket columns are added as
  // new columns with prefix "s". The overlap count is forwarded as
  // generateColumns()' 'offset' argument.
  if (const auto remainingColumns = maxNumColumns - numOverlapColumns;
      remainingColumns > 0) {
    const auto nonOverlapColumns = generateColumns(
        remainingColumns,
        "s",
        kSupportedSortColumnTypes_,
        1,
        names,
        types,
        numOverlapColumns);
    columns.insert(
        columns.end(), nonOverlapColumns.begin(), nonOverlapColumns.end());
  }

  std::get<1>(result) = numOverlapColumns;
  return result;
}

std::vector<RowVectorPtr> WriterFuzzer::generateInputData(
std::vector<std::string> names,
std::vector<TypePtr> types,
Expand Down

0 comments on commit fe7d648

Please sign in to comment.