From bc1b6f22920f527be91db728c4f15ff9d3acfeb8 Mon Sep 17 00:00:00 2001 From: Wei He Date: Mon, 18 Nov 2024 10:56:35 -0800 Subject: [PATCH] Constrained input generators for fuzzers (#11368) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/11368 DO NOT REVIEW FOR NOW. This diff adds a prototype of the constrained input generator for fuzzers. These generators can be used to generate random data satisfying given constraints, which is needed by fuzzers that test functions with special requirements on their input data, such as cdf functions, subscript functions, etc. Differential Revision: D65101030 --- velox/CMakeLists.txt | 1 + .../fuzzer_input_generator/CMakeLists.txt | 26 ++ .../ConstrainedGenerators.cpp | 206 ++++++++++ .../ConstrainedGenerators.h | 359 ++++++++++++++++++ .../ConstrainedVectorGenerator.cpp | 149 ++++++++ .../ConstrainedVectorGenerator.h | 38 ++ .../tests/CMakeLists.txt | 28 ++ .../tests/ConstrainedGeneratorsTest.cpp | 307 +++++++++++++++ velox/vector/fuzzer/CMakeLists.txt | 9 +- velox/vector/fuzzer/Utils.cpp | 82 +++- velox/vector/fuzzer/Utils.h | 157 +++++++- velox/vector/fuzzer/VectorFuzzer.cpp | 174 +-------- velox/vector/fuzzer/VectorFuzzer.h | 12 - 13 files changed, 1362 insertions(+), 186 deletions(-) create mode 100644 velox/experimental/fuzzer_input_generator/CMakeLists.txt create mode 100644 velox/experimental/fuzzer_input_generator/ConstrainedGenerators.cpp create mode 100644 velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h create mode 100644 velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.cpp create mode 100644 velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h create mode 100644 velox/experimental/fuzzer_input_generator/tests/CMakeLists.txt create mode 100644 velox/experimental/fuzzer_input_generator/tests/ConstrainedGeneratorsTest.cpp diff --git a/velox/CMakeLists.txt b/velox/CMakeLists.txt index fe2d23371aa8..c992dc9eee80 100644 --- a/velox/CMakeLists.txt 
+++ b/velox/CMakeLists.txt @@ -25,6 +25,7 @@ add_subdirectory(flag_definitions) add_subdirectory(external/date) add_subdirectory(external/md5) add_subdirectory(external/hdfs) +add_subdirectory(experimental/fuzzer_input_generator) # # examples depend on expression diff --git a/velox/experimental/fuzzer_input_generator/CMakeLists.txt b/velox/experimental/fuzzer_input_generator/CMakeLists.txt new file mode 100644 index 000000000000..6343f0bb9961 --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/CMakeLists.txt @@ -0,0 +1,26 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +velox_add_library(velox_fuzzer_constrained_input_generators + ConstrainedGenerators.cpp ConstrainedVectorGenerator.cpp) + +velox_link_libraries( + velox_fuzzer_constrained_input_generators + Folly::folly + velox_expression + velox_type + velox_vector_fuzzer_util) + +if(${VELOX_BUILD_TESTING}) + add_subdirectory(tests) +endif() diff --git a/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.cpp b/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.cpp new file mode 100644 index 000000000000..c3629bfd0ca8 --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.cpp @@ -0,0 +1,206 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h" + +#include + +#include "velox/vector/fuzzer/Utils.h" + +namespace facebook::velox::fuzzer { + +// AbstractInputGenerator +AbstractInputGenerator::AbstractInputGenerator( + size_t seed, + const TypePtr& type, + std::unique_ptr&& next, + double nullRatio) + : type_{type}, next_{std::move(next)}, nullRatio_{nullRatio} { + rng_.seed(seed); +} + +// NotEqualConstrainedGenerator +variant NotEqualConstrainedGenerator::generate() { + variant value; + do { + value = next_->generate(); + } while (value == excludedValue_); + return value; +} + +// SetConstrainedGenerator +variant SetConstrainedGenerator::generate() { + const auto index = + boost::random::uniform_int_distribution(0, set_.size() - 1)(rng_); + return set_[index]; +} + +// JsonInputGenerator +folly::json::serialization_opts JsonInputGenerator::getSerializationOptions() { + folly::json::serialization_opts opts; + opts.allow_non_string_keys = true; + opts.allow_nan_inf = true; + if (makeRandomVariation_) { + opts.convert_int_keys = rand(rng_); + opts.pretty_formatting = rand(rng_); + opts.pretty_formatting_indent_width = rand(rng_, 0, 4); + opts.encode_non_ascii = rand(rng_); + opts.allow_trailing_comma = rand(rng_); + opts.sort_keys = rand(rng_); + opts.skip_invalid_utf8 = rand(rng_); + opts.parse_numbers_as_strings = rand(rng_); + } + return opts; +} + +variant JsonInputGenerator::generate() { + if (coinToss(rng_, nullRatio_)) { + return variant::null(type_->kind()); + } + + const auto object = 
objectGenerator_->generate(); + const folly::dynamic jsonObject = convertVariantToDynamic(object); + const auto jsonString = folly::json::serialize(jsonObject, opts_); + if (makeRandomVariation_ && coinToss(rng_, 0.5)) { + makeRandomVariation(jsonString); + } + return variant(jsonString); +} + +folly::dynamic JsonInputGenerator::convertVariantToDynamic( + const variant& object) { + if (object.isNull()) { + return folly::dynamic(); + } + + switch (object.kind()) { + case TypeKind::BOOLEAN: + return convertVariantToDynamicPrimitive(object); + case TypeKind::TINYINT: + return convertVariantToDynamicPrimitive(object); + case TypeKind::SMALLINT: + return convertVariantToDynamicPrimitive(object); + case TypeKind::INTEGER: + return convertVariantToDynamicPrimitive(object); + case TypeKind::BIGINT: + return convertVariantToDynamicPrimitive(object); + case TypeKind::REAL: + return convertVariantToDynamicPrimitive(object); + case TypeKind::DOUBLE: + return convertVariantToDynamicPrimitive(object); + case TypeKind::VARCHAR: + return convertVariantToDynamicPrimitive(object); + case TypeKind::VARBINARY: + return convertVariantToDynamicPrimitive(object); + case TypeKind::TIMESTAMP: + return convertVariantToDynamicPrimitive(object); + case TypeKind::HUGEINT: + return convertVariantToDynamicPrimitive(object); + case TypeKind::ARRAY: { + folly::dynamic array = folly::dynamic::array; + for (const auto& element : object.value()) { + array.push_back(convertVariantToDynamic(element)); + } + return array; + } + case TypeKind::MAP: { + folly::dynamic map = folly::dynamic::object; + for (const auto& [key, value] : object.value()) { + map[convertVariantToDynamic(key)] = convertVariantToDynamic(value); + } + return map; + } + case TypeKind::ROW: { + folly::dynamic array = folly::dynamic::array; + for (const auto& element : object.value()) { + array.push_back(convertVariantToDynamic(element)); + } + return array; + } + default: + VELOX_UNREACHABLE("Unsupported type"); + } +} + +std::vector 
getControlCharacters() { + static std::vector controlCharacters = { + "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", + "\x07", "\x08", "\x09", "\x0A", "\x0B", "\x0C", "\x0D", + "\x0E", "\x0F", "\x10", "\x11", "\x12", "\x13", "\x14", + "\x15", "\x16", "\x17", "\x18", "\x19", "\x1A", "\x1B", + "\x1C", "\x1D", "\x1E", "\x1F", "\x20", "\x7F", "\u0080", + "\u0081", "\u0082", "\u0083", "\u0084", "\u0085", "\u0086", "\u0087", + "\u0088", "\u0089", "\u008A", "\u008B", "\u008C", "\u008D", "\u008E", + "\u008F", "\u0090", "\u0091", "\u0092", "\u0093", "\u0094", "\u0095", + "\u0096", "\u0097", "\u0098", "\u0099", "\u009A", "\u009B", "\u009C", + "\u009D", "\u009E", "\u009F"}; + return controlCharacters; +}; + +void JsonInputGenerator::makeRandomVariation(std::string json) { + if (coinToss(rng_, 0.1)) { + const auto controlCharacters = getControlCharacters(); + const auto index = rand(rng_, 0, controlCharacters.size() - 1); + const auto& controlCharacter = controlCharacters[index]; + const auto indexToInsert = rand(rng_, 0, json.size()); + json.insert(indexToInsert, controlCharacter); + } else if (coinToss(rng_, 0.1)) { + const auto size = rand(rng_, 0, json.size()); + json.resize(size); + } +} + +// Utility functions +template +std::unique_ptr getRandomInputGeneratorPrimitive( + size_t seed, + const TypePtr& type, + double nullRatio) { + using T = typename TypeTraits::NativeType; + std::unique_ptr generator = + std::make_unique>(seed, type, nullRatio); + return generator; +} + +std::unique_ptr +getRandomInputGenerator(size_t seed, const TypePtr& type, double nullRatio) { + std::unique_ptr generator; + if (type->isPrimitiveType()) { + return VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH( + getRandomInputGeneratorPrimitive, + false, + type->kind(), + seed, + type, + nullRatio); + } else if (type->isArray()) { + generator = std::make_unique>( + seed, type, nullRatio); + } else if (type->isMap()) { + generator = + std::make_unique>(seed, type, nullRatio); + + } else if 
(type->isRow()) { + generator = std::make_unique>( + seed, + type, + std::vector>{}, + nullRatio); + } + return generator; +} + +} // namespace facebook::velox::fuzzer diff --git a/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h b/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h new file mode 100644 index 000000000000..c829065b891c --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h @@ -0,0 +1,359 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "folly/json.h" + +#include "velox/type/Type.h" +#include "velox/type/Variant.h" +#include "velox/vector/fuzzer/Utils.h" + +namespace facebook::velox::fuzzer { + +using facebook::velox::variant; + +class AbstractInputGenerator { + public: + AbstractInputGenerator( + size_t seed, + const TypePtr& type, + std::unique_ptr&& next, + double nullRatio); + + virtual ~AbstractInputGenerator() = default; + + virtual variant generate() = 0; + + TypePtr type() const { + return type_; + } + + protected: + FuzzerGenerator rng_; + + TypePtr type_; + + std::unique_ptr next_; + + double nullRatio_; +}; + +std::unique_ptr +getRandomInputGenerator(size_t seed, const TypePtr& type, double nullRatio); + +template +class RandomInputGenerator : public AbstractInputGenerator { + public: + RandomInputGenerator(size_t seed, const TypePtr& type, double nullRatio) + : AbstractInputGenerator(seed, type, nullptr, nullRatio) {} + + ~RandomInputGenerator() override = default; + + variant generate() override { + if (coinToss(rng_, nullRatio_)) { + return variant::null(type_->kind()); + } + + if (type_->isDate()) { + return variant(randDate(rng_)); + } + return variant(rand(rng_)); + } +}; + +template +class RandomInputGenerator>> + : public AbstractInputGenerator { + public: + RandomInputGenerator>>( + size_t seed, + const TypePtr& type, + double nullRatio, + size_t maxLength = 20, + const std::vector& encodings = + {UTF8CharList::ASCII, + UTF8CharList::UNICODE_CASE_SENSITIVE, + UTF8CharList::EXTENDED_UNICODE, + UTF8CharList::MATHEMATICAL_SYMBOLS}) + : AbstractInputGenerator(seed, type, nullptr, nullRatio), + maxLength_{maxLength}, + encodings_{encodings} {} + + ~RandomInputGenerator>>() + override = default; + + variant generate() override { + if (coinToss(rng_, nullRatio_)) { + return variant::null(type_->kind()); + } + + const auto length = rand(rng_, 0, maxLength_); + std::wstring_convert, char16_t> converter; + std::string buf; + return 
variant(randString(rng_, length, encodings_, buf, converter)); + } + + private: + const size_t maxLength_; + + std::vector encodings_; +}; + +template +class RandomInputGenerator>> + : public AbstractInputGenerator { + public: + RandomInputGenerator>>( + size_t seed, + const TypePtr& type, + double nullRatio, + size_t maxLength = 10, + std::unique_ptr&& elementGenerator = nullptr, + std::optional containAtIndex = std::nullopt, + std::unique_ptr&& containGenerator = nullptr) + : AbstractInputGenerator(seed, type, nullptr, nullRatio), + maxLength_{maxLength}, + elementGenerator_{ + elementGenerator + ? std::move(elementGenerator) + : getRandomInputGenerator(seed, type->childAt(0), nullRatio)}, + containAtIndex_{containAtIndex}, + containGenerator_{std::move(containGenerator)} {} + + ~RandomInputGenerator>>() + override = default; + + variant generate() override { + if (coinToss(rng_, nullRatio_)) { + return variant::null(TypeKind::ARRAY); + } + + const auto length = rand(rng_, 0, maxLength_); + std::vector elements; + elements.reserve(length); + for (size_t i = 0; i < length; ++i) { + if UNLIKELY (containAtIndex_.has_value() && *containAtIndex_ == i) { + elements.push_back(containGenerator_->generate()); + } else { + elements.push_back(elementGenerator_->generate()); + } + } + return variant::array(elements); + } + + private: + const size_t maxLength_; + + std::unique_ptr elementGenerator_; + + std::optional containAtIndex_; + + std::unique_ptr containGenerator_; +}; + +template +class RandomInputGenerator>> + : public AbstractInputGenerator { + public: + RandomInputGenerator>>( + size_t seed, + const TypePtr& type, + double nullRatio, + size_t maxLength = 10, + std::unique_ptr&& keyGenerator = nullptr, + std::unique_ptr&& valueGenerator = nullptr, + std::optional containAtIndex = std::nullopt, + std::unique_ptr&& containKeyGenerator = nullptr, + std::unique_ptr&& containValueGenerator = nullptr) + : AbstractInputGenerator(seed, type, nullptr, nullRatio), + 
maxLength_{maxLength}, + keyGenerator_{ + keyGenerator + ? std::move(keyGenerator) + : getRandomInputGenerator(seed, type->childAt(0), 0.0)}, + valueGenerator_{ + valueGenerator + ? std::move(valueGenerator) + : getRandomInputGenerator(seed, type->childAt(1), nullRatio)}, + containAtIndex_{containAtIndex}, + containKeyGenerator_{std::move(containKeyGenerator)}, + containValueGenerator_{std::move(containValueGenerator)} {} + + ~RandomInputGenerator>>() + override = default; + + variant generate() override { + if (coinToss(rng_, nullRatio_)) { + return variant::null(TypeKind::MAP); + } + + const auto length = rand(rng_, 0, maxLength_); + std::map map; + for (size_t i = 0; i < length; ++i) { + if UNLIKELY (containAtIndex_.has_value() && *containAtIndex_ == i) { + map.emplace( + containKeyGenerator_->generate(), + containValueGenerator_->generate()); + } else { + map.emplace(keyGenerator_->generate(), valueGenerator_->generate()); + } + } + return variant::map(map); + } + + private: + const size_t maxLength_; + + std::unique_ptr keyGenerator_; + + std::unique_ptr valueGenerator_; + + std::optional containAtIndex_; + + std::unique_ptr containKeyGenerator_; + + std::unique_ptr containValueGenerator_; +}; + +template +class RandomInputGenerator>> + : public AbstractInputGenerator { + public: + RandomInputGenerator>>( + size_t seed, + const TypePtr& type, + std::vector> fieldGenerators, + double nullRatio) + : AbstractInputGenerator(seed, type, nullptr, nullRatio) { + const auto length = type->size(); + fieldGenerators_ = std::move(fieldGenerators); + for (size_t i = 0; i < length; ++i) { + if (fieldGenerators_.size() <= i) { + fieldGenerators_.push_back( + getRandomInputGenerator(seed, type->childAt(i), nullRatio)); + } else if (fieldGenerators_[i] == nullptr) { + fieldGenerators_[i] = + getRandomInputGenerator(seed, type->childAt(i), nullRatio); + } + } + } + + ~RandomInputGenerator>>() + override = default; + + variant generate() override { + if (coinToss(rng_, 
nullRatio_)) { + return variant::null(TypeKind::ROW); + } + + const auto length = type_->size(); + std::vector fields; + fields.reserve(length); + for (size_t i = 0; i < length; ++i) { + fields.push_back(fieldGenerators_[i]->generate()); + } + return variant::row(fields); + } + + private: + std::vector> fieldGenerators_; +}; + +class NotEqualConstrainedGenerator : public AbstractInputGenerator { + public: + // nullRatio doesn't affect the data generation because it is 'next' that + // generates data. + NotEqualConstrainedGenerator( + size_t seed, + const TypePtr& type, + const variant& excludedValue, + std::unique_ptr&& next) + : AbstractInputGenerator(seed, type, std::move(next), 0.0), + excludedValue_{excludedValue} {} + + ~NotEqualConstrainedGenerator() override = default; + + variant generate() override; + + private: + variant excludedValue_; +}; + +class SetConstrainedGenerator : public AbstractInputGenerator { + public: + // nullRatio doesn't affect the data generation because only variants in 'set' + // can be generated. 
+ SetConstrainedGenerator( + size_t seed, + const TypePtr& type, + const std::vector& set) + : AbstractInputGenerator(seed, type, nullptr, 0.0), set_{set} {} + + ~SetConstrainedGenerator() override = default; + + variant generate() override; + + private: + std::vector set_; +}; + +class JsonInputGenerator : public AbstractInputGenerator { + public: + JsonInputGenerator( + size_t seed, + const TypePtr& type, + double nullRatio, + std::unique_ptr&& objectGenerator, + bool makeRandomVariation = false) + : AbstractInputGenerator(seed, type, nullptr, nullRatio), + objectGenerator_{std::move(objectGenerator)}, + makeRandomVariation_{makeRandomVariation}, + opts_{getSerializationOptions()} {} + + ~JsonInputGenerator() override = default; + + variant generate() override; + + const folly::json::serialization_opts& serializationOptions() const { + return opts_; + } + + private: + template + folly::dynamic convertVariantToDynamicPrimitive(const variant& v) { + using T = typename TypeTraits::DeepCopiedType; + VELOX_CHECK(v.isSet()); + const T value = v.value(); + return folly::dynamic(value); + } + + folly::dynamic convertVariantToDynamic(const variant& object); + + void makeRandomVariation(std::string json); + + folly::json::serialization_opts getSerializationOptions(); + + std::unique_ptr objectGenerator_; + + bool makeRandomVariation_; + + folly::json::serialization_opts opts_; +}; + +} // namespace facebook::velox::fuzzer diff --git a/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.cpp b/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.cpp new file mode 100644 index 000000000000..20797769ea76 --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h" + +#include "velox/expression/VectorWriters.h" + +namespace facebook::velox::fuzzer { + +using exec::GenericWriter; +using exec::VectorWriter; + +// static +VectorPtr ConstrainedVectorGenerator::generateConstant( + const std::shared_ptr& customGenerator, + vector_size_t size, + memory::MemoryPool* pool) { + VELOX_CHECK_NOT_NULL(customGenerator); + VELOX_CHECK(customGenerator->type()->isPrimitiveType()); + + const auto& type = customGenerator->type(); + const auto variant = customGenerator->generate(); + + return BaseVector::createConstant(type, variant, size, pool); +} + +template +void writeOne(const variant& value, GenericWriter& writer); +template <> +void writeOne(const variant& value, GenericWriter& writer); +template <> +void writeOne(const variant& value, GenericWriter& writer); +template <> +void writeOne(const variant& value, GenericWriter& writer); +template <> +void writeOne(const variant& value, GenericWriter& writer); +template <> +void writeOne(const variant& value, GenericWriter& writer); + +template +void writeOne(const variant& value, GenericWriter& writer) { + using T = typename TypeTraits::NativeType; + writer.template castTo() = value.value(); +} + +template <> +void writeOne(const variant& value, GenericWriter& writer) { + writer.template castTo() = value.value(); +} + +template <> +void writeOne( + const variant& value, + GenericWriter& writer) { + writer.template castTo() = value.value(); +} + +template <> +void writeOne(const variant& 
value, GenericWriter& writer) { + auto& writerTyped = writer.template castTo>(); + const auto& elements = value.array(); + for (const auto& element : elements) { + if (element.isNull()) { + writerTyped.add_null(); + } else { + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, element.kind(), element, writerTyped.add_item()); + } + } +} + +template <> +void writeOne(const variant& value, GenericWriter& writer) { + auto& writerTyped = writer.template castTo>(); + const auto& map = value.map(); + for (const auto& pair : map) { + const auto& key = pair.first; + const auto& value = pair.second; + VELOX_CHECK(!key.isNull()); + if (value.isNull()) { + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, key.kind(), key, writerTyped.add_null()); + } else { + auto writers = writerTyped.add_item(); + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, key.kind(), key, std::get<0>(writers)); + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, value.kind(), value, std::get<1>(writers)); + } + } +} + +template <> +void writeOne(const variant& value, GenericWriter& writer) { + auto& writerTyped = writer.template castTo(); + const auto& elements = value.row(); + column_index_t i = 0; + for (const auto& element : elements) { + if (element.isNull()) { + writerTyped.set_null_at(i); + } else { + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, element.kind(), element, writerTyped.get_writer_at(i)); + } + i++; + } +} + +// static +VectorPtr ConstrainedVectorGenerator::generateFlat( + const std::shared_ptr& customGenerator, + vector_size_t size, + memory::MemoryPool* pool) { + VELOX_CHECK_NOT_NULL(customGenerator); + + VectorPtr result; + const auto& type = customGenerator->type(); + BaseVector::ensureWritable(SelectivityVector(size), type, pool, result); + VectorWriter writer; + writer.init(*result); + + for (auto i = 0; i < size; ++i) { + writer.setOffset(i); + const auto variant = customGenerator->generate(); + if (variant.isNull()) { + writer.commitNull(); + } else { + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, type->kind(), 
variant, writer.current()); + writer.commit(true); + } + } + return result; +} + +} // namespace facebook::velox::fuzzer diff --git a/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h b/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h new file mode 100644 index 000000000000..555b905f46b0 --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h" + +namespace facebook::velox::fuzzer { + +class ConstrainedVectorGenerator { + public: + ConstrainedVectorGenerator() = delete; + + static VectorPtr generateConstant( + const std::shared_ptr& customGenerator, + vector_size_t size, + memory::MemoryPool* pool); + + static VectorPtr generateFlat( + const std::shared_ptr& customGenerator, + vector_size_t size, + memory::MemoryPool* pool); +}; + +} // namespace facebook::velox::fuzzer diff --git a/velox/experimental/fuzzer_input_generator/tests/CMakeLists.txt b/velox/experimental/fuzzer_input_generator/tests/CMakeLists.txt new file mode 100644 index 000000000000..3f781a64aaf3 --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/tests/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_executable(velox_fuzzer_constrained_input_generators_test + ConstrainedGeneratorsTest.cpp) + +add_test( + NAME velox_fuzzer_constrained_input_generators_test + COMMAND velox_fuzzer_constrained_input_generators_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + +target_link_libraries( + velox_fuzzer_constrained_input_generators_test + velox_fuzzer_constrained_input_generators + velox_presto_types + velox_type + velox_vector_test_lib) diff --git a/velox/experimental/fuzzer_input_generator/tests/ConstrainedGeneratorsTest.cpp b/velox/experimental/fuzzer_input_generator/tests/ConstrainedGeneratorsTest.cpp new file mode 100644 index 000000000000..8250921f3d7c --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/tests/ConstrainedGeneratorsTest.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h" + +#include + +#include "velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h" +#include "velox/functions/prestosql/types/JsonType.h" +#include "velox/type/Variant.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +namespace facebook::velox::fuzzer::test { + +class ConstrainedGeneratorsTest : public testing::Test, + public velox::test::VectorTestBase { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + template + void testRandomPrimitive(const TypePtr& type, bool testNull) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::NativeType; + + std::unique_ptr generator = + std::make_unique>( + 0, type, testNull ? 1.0 : 0.0); + auto value = generator->generate(); + if (testNull) { + EXPECT_FALSE(value.hasValue()); + } else { + EXPECT_TRUE(value.hasValue()); + } + EXPECT_EQ(value.kind(), KIND); + } + + template + void testRandomComplex(const TypePtr& type, bool testNull) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::ImplType; + + std::unique_ptr generator = + std::make_unique>( + 0, type, testNull ? 1.0 : 0.0); + auto value = generator->generate(); + if (testNull) { + EXPECT_FALSE(value.hasValue()); + } else { + EXPECT_TRUE(value.hasValue()); + } + EXPECT_EQ(value.kind(), KIND); // TODO: check type recursive + } + + template + void testNotEqualPrimitive( + const TypePtr& type, + const TValue& excludedValue, + bool testNull) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::NativeType; + + variant excludedVariant{excludedValue}; + std::unique_ptr generator = + std::make_unique( + 0, + type, + excludedVariant, + std::make_unique>( + 0, type, testNull ? 
1.0 : 0.0)); + + const uint32_t kIterations = 1000; + for (uint32_t i = 0; i < kIterations; ++i) { + auto value = generator->generate(); + EXPECT_EQ(value.kind(), KIND); + if (testNull) { + EXPECT_FALSE(value.hasValue()); + } else { + EXPECT_TRUE(value.hasValue()); + EXPECT_NE(value, excludedVariant); + } + } + } + + template + void testNotEqualComplex( + const TypePtr& type, + const variant& excludedVariant, + bool testNull) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::ImplType; + + std::unique_ptr generator = + std::make_unique( + 0, + type, + excludedVariant, + std::make_unique>( + 0, type, testNull ? 1.0 : 0.0)); + + const uint32_t kIterations = 1000; + for (uint32_t i = 0; i < kIterations; ++i) { + auto value = generator->generate(); + EXPECT_EQ(value.kind(), KIND); // todo: test kind recursively. + if (testNull) { + EXPECT_FALSE(value.hasValue()); + } else { + EXPECT_TRUE(value.hasValue()); + EXPECT_NE(value, excludedVariant); + } + } + } + + template + void testSetPrimitive(const TypePtr& type, const TSet& setOfRawValues) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::NativeType; + + const uint32_t kIterations = 1000; + std::vector variants; + for (const auto& value : setOfRawValues) { + variants.push_back(variant{value}); + } + std::unique_ptr generator = + std::make_unique(0, type, variants); + + for (uint32_t i = 0; i < kIterations; ++i) { + auto value = generator->generate(); + EXPECT_TRUE(value.hasValue()); + EXPECT_EQ(value.kind(), KIND); + EXPECT_NE(setOfRawValues.count(value.value()), 0); + } + } + + template + void testSetComplex( + const TypePtr& type, + const std::vector& variants) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::ImplType; + + std::set setOfVariants{variants.begin(), variants.end()}; + + std::unique_ptr generator = + std::make_unique(0, type, variants); + + const uint32_t kIterations = 1000; + for (uint32_t i = 0; i < kIterations; ++i) { + auto value = 
generator->generate(); + EXPECT_TRUE(value.hasValue()); + EXPECT_EQ(value.kind(), KIND); // todo: check type recursive + EXPECT_NE(setOfVariants.count(value), 0); + } + } + + template + void testGenerateVectorsPrimitive( + const TypePtr& type, + const variant& excludedValue) { + using T = typename TypeTraits::NativeType; + const uint32_t kSize = 1000; + std::shared_ptr generator = + std::make_shared( + 0, + type, + excludedValue, + std::make_unique>(0, type, 0.5)); + auto vector = + ConstrainedVectorGenerator::generateConstant(generator, kSize, pool()); + EXPECT_EQ(vector->size(), kSize); + EXPECT_EQ(vector->typeKind(), KIND); + EXPECT_TRUE(vector->isConstantEncoding()); + EXPECT_TRUE( + vector->isNullAt(0) || + vector->as>()->valueAt(0) != excludedValue); + + vector = ConstrainedVectorGenerator::generateFlat(generator, kSize, pool()); + EXPECT_EQ(vector->size(), kSize); + EXPECT_EQ(vector->typeKind(), KIND); + EXPECT_TRUE(vector->isFlatEncoding()); + bool hasNull = false; + for (auto i = 0; i < kSize; ++i) { + if (vector->isNullAt(i)) { + hasNull = true; + } else { + EXPECT_NE(vector->as>()->valueAt(i), excludedValue); + } + } + EXPECT_TRUE(hasNull); + } + + template + void testGenerateVectorsComplex(const TypePtr& type) { + using T = typename TypeTraits::ImplType; + const uint32_t kSize = 1000; + std::shared_ptr generator = + std::make_shared>(0, type, 0.5); + auto vector = + ConstrainedVectorGenerator::generateFlat(generator, kSize, pool()); + EXPECT_EQ(vector->size(), kSize); + EXPECT_EQ(vector->type(), type); + } +}; + +TEST_F(ConstrainedGeneratorsTest, randomPrimitive) { + testRandomPrimitive(INTEGER(), false); + testRandomPrimitive(VARCHAR(), false); + + testRandomPrimitive(INTEGER(), true); + testRandomPrimitive(VARCHAR(), true); +} + +TEST_F(ConstrainedGeneratorsTest, randomComplex) { + testRandomComplex( + ARRAY(MAP(VARCHAR(), ROW({BIGINT()}))), false); + testRandomComplex( + ARRAY(MAP(VARCHAR(), ROW({BIGINT()}))), true); +} + 
+TEST_F(ConstrainedGeneratorsTest, notEqPrimitive) { + testNotEqualPrimitive( + TINYINT(), static_cast(1), false); + testNotEqualPrimitive(VARCHAR(), ""_sv, false); + + testNotEqualPrimitive( + TINYINT(), static_cast(1), true); + testNotEqualPrimitive(VARCHAR(), ""_sv, true); +} + +TEST_F(ConstrainedGeneratorsTest, notEqComplex) { + auto excludedVariant = variant::array({variant::map( + {{variant{"1"}, variant::row({variant{static_cast(1)}})}})}); + testNotEqualComplex( + ARRAY(MAP(VARCHAR(), ROW({BIGINT()}))), excludedVariant, false); + testNotEqualComplex( + ARRAY(MAP(VARCHAR(), ROW({BIGINT()}))), excludedVariant, true); +} + +TEST_F(ConstrainedGeneratorsTest, setPrimitive) { + std::unordered_set integers{{1, 2, 3}}; + testSetPrimitive(INTEGER(), integers); + + std::unordered_set strings{{"1", "2", "3"}}; + testSetPrimitive(VARCHAR(), strings); +} + +TEST_F(ConstrainedGeneratorsTest, setComplex) { + std::vector variants{ + variant::array({variant::map( + {{variant{"1"}, variant::row({variant{static_cast(1)}})}})}), + variant::array({variant::map( + {{variant{"2"}, + variant::row({variant{static_cast(2)}})}})})}; + testSetComplex( + ARRAY(MAP(VARCHAR(), ROW({BIGINT()}))), variants); +} + +TEST_F(ConstrainedGeneratorsTest, json) { + const TypePtr type = ARRAY(MAP(DOUBLE(), ROW({BIGINT()}))); + std::unique_ptr generator = + std::make_unique( + 0, + JSON(), + 0.4, + std::make_unique>(0, type, 0.4)); + + const uint32_t kIterations = 1000; + const auto& opts = generator->serializationOptions(); + for (uint32_t i = 0; i < kIterations; ++i) { + auto value = generator->generate(); + EXPECT_EQ(value.kind(), TypeKind::VARCHAR); + if (value.hasValue()) { + EXPECT_TRUE(value.hasValue()); + folly::dynamic json; + auto jsonString = value.value(); + EXPECT_NO_THROW( + json = folly::parseJson(value.value(), opts)); + EXPECT_TRUE(json.isNull() || json.isArray()); + } + } +} + +TEST_F(ConstrainedGeneratorsTest, generateVectors) { + testGenerateVectorsPrimitive(BIGINT(), 
variant(0)); + testGenerateVectorsPrimitive(VARCHAR(), variant("")); + + testGenerateVectorsComplex( + ARRAY(ROW({MAP(VARCHAR(), BIGINT())}))); + testGenerateVectorsComplex( + MAP(ARRAY(BIGINT()), ROW({VARCHAR()}))); +} + +} // namespace facebook::velox::fuzzer::test diff --git a/velox/vector/fuzzer/CMakeLists.txt b/velox/vector/fuzzer/CMakeLists.txt index 43b8cc83746d..e28dbfdcd996 100644 --- a/velox/vector/fuzzer/CMakeLists.txt +++ b/velox/vector/fuzzer/CMakeLists.txt @@ -12,10 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_vector_fuzzer GeneratorSpec.cpp Utils.cpp VectorFuzzer.cpp) +add_library(velox_vector_fuzzer_util Utils.cpp) target_link_libraries( - velox_vector_fuzzer velox_type velox_vector) + velox_vector_fuzzer_util velox_vector) + +add_library(velox_vector_fuzzer GeneratorSpec.cpp VectorFuzzer.cpp) + +target_link_libraries( + velox_vector_fuzzer velox_type velox_vector velox_vector_fuzzer_util) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") target_compile_options(velox_vector_fuzzer PRIVATE -Wno-deprecated-declarations) diff --git a/velox/vector/fuzzer/Utils.cpp b/velox/vector/fuzzer/Utils.cpp index 805fcf1063d7..50b50055efd7 100644 --- a/velox/vector/fuzzer/Utils.cpp +++ b/velox/vector/fuzzer/Utils.cpp @@ -16,13 +16,89 @@ #include "velox/vector/fuzzer/Utils.h" -namespace facebook::velox::generator_spec_utils { +namespace facebook::velox { bool coinToss(FuzzerGenerator& rng, double threshold) { static std::uniform_real_distribution<> dist(0.0, 1.0); return dist(rng) < threshold; } +Timestamp randTimestamp( + FuzzerGenerator& rng, + FuzzerTimestampPrecision timestampPrecision) { + // Generate timestamps only in the valid range to avoid datetime functions, + // such as try_cast(varchar as timestamp), throwing VeloxRuntimeError in + // fuzzers. 
+ constexpr int64_t min = -2'140'671'600; + constexpr int64_t max = 2'140'671'600; + constexpr int64_t microInSecond = 1'000'000; + constexpr int64_t millisInSecond = 1'000; + // DWRF requires nano to be in a certain range. Hardcode the value here to + // avoid the dependency on DWRF. + constexpr int64_t MAX_NANOS = 1'000'000'000; + + switch (timestampPrecision) { + case FuzzerTimestampPrecision::kNanoSeconds: + return Timestamp( + rand(rng, min, max), (rand(rng) % MAX_NANOS)); + case FuzzerTimestampPrecision::kMicroSeconds: + return Timestamp::fromMicros( + rand(rng, min, max) * microInSecond + + rand(rng, -microInSecond, microInSecond)); + case FuzzerTimestampPrecision::kMilliSeconds: + return Timestamp::fromMillis( + rand(rng, min, max) * millisInSecond + + rand(rng, -millisInSecond, millisInSecond)); + case FuzzerTimestampPrecision::kSeconds: + return Timestamp(rand(rng, min, max), 0); + } + return {}; // no-op. +} + +int32_t randDate(FuzzerGenerator& rng) { + constexpr int64_t min = -24'450; + constexpr int64_t max = 24'450; + return rand(rng, min, max); +} + +FOLLY_ALWAYS_INLINE char16_t getRandomChar( + FuzzerGenerator& rng, + const std::vector>& charSet) { + const auto& chars = charSet.size() == 1 + ? charSet.front() + : charSet[rand(rng) % charSet.size()]; + auto size = chars.second - chars.first; + auto inc = (rand(rng) % size); + char16_t res = chars.first + inc; + return res; +} + +std::string randString( + FuzzerGenerator& rng, + size_t length, + const std::vector& encodings, + std::string& buf, + std::wstring_convert, char16_t>& converter) { + buf.clear(); + std::u16string wbuf; + wbuf.resize(length); + + for (size_t i = 0; i < length; ++i) { + // First choose a random encoding from the list of input acceptable + // encodings. + VELOX_CHECK_GE(encodings.size(), 1); + const auto& encoding = (encodings.size() == 1) + ? 
encodings.front() + : encodings[rand(rng) % encodings.size()]; + + wbuf[i] = getRandomChar(rng, kUTFChatSets[encoding]); + } + buf.append(converter.to_bytes(wbuf)); + return buf; +} + +namespace generator_spec_utils { + vector_size_t getRandomIndex(FuzzerGenerator& rng, vector_size_t maxIndex) { std::uniform_int_distribution indexGenerator( 0, maxIndex); // generates index in [0, maxIndex] @@ -59,4 +135,6 @@ BufferPtr generateIndicesBuffer( return indices; } -} // namespace facebook::velox::generator_spec_utils +} // namespace generator_spec_utils + +} // namespace facebook::velox diff --git a/velox/vector/fuzzer/Utils.h b/velox/vector/fuzzer/Utils.h index 0248b08f942f..0010e511593c 100644 --- a/velox/vector/fuzzer/Utils.h +++ b/velox/vector/fuzzer/Utils.h @@ -16,6 +16,12 @@ #pragma once +#include +#include + +#include +#include + #include "velox/vector/BaseVector.h" #include "velox/vector/NullsBuilder.h" @@ -23,6 +29,57 @@ namespace facebook::velox { using FuzzerGenerator = std::mt19937; +enum UTF8CharList { + ASCII = 0, // Ascii character set. + UNICODE_CASE_SENSITIVE = 1, // Unicode scripts that support case. + EXTENDED_UNICODE = 2, // Extended Unicode: Arabic, Devanagiri etc + MATHEMATICAL_SYMBOLS = 3 // Mathematical Symbols. +}; + +/// Unicode character ranges. Ensure the vector indexes match the UTF8CharList +/// enum values. +/// +/// Source: https://jrgraphix.net/research/unicode_blocks.php +static const std::vector>> + kUTFChatSets{ + // UTF8CharList::ASCII + { + {33, 127}, // All ASCII printable chars. + }, + // UTF8CharList::UNICODE_CASE_SENSITIVE + { + {u'\u0020', u'\u007F'}, // Basic Latin. + {u'\u0400', u'\u04FF'}, // Cyrillic. + }, + // UTF8CharList::EXTENDED_UNICODE + { + {u'\u03F0', u'\u03FF'}, // Greek. + {u'\u0100', u'\u017F'}, // Latin Extended A. + {u'\u0600', u'\u06FF'}, // Arabic. + {u'\u0900', u'\u097F'}, // Devanagari. + {u'\u0600', u'\u06FF'}, // Hebrew. + {u'\u3040', u'\u309F'}, // Hiragana. + {u'\u2000', u'\u206F'}, // Punctuation. 
+ {u'\u2070', u'\u209F'}, // Sub/Super Script. + {u'\u20A0', u'\u20CF'}, // Currency. + }, + // UTF8CharList::MATHEMATICAL_SYMBOLS + { + {u'\u2200', u'\u22FF'}, // Math Operators. + {u'\u2150', u'\u218F'}, // Number Forms. + {u'\u25A0', u'\u25FF'}, // Geometric Shapes. + {u'\u27C0', u'\u27EF'}, // Math Symbols. + {u'\u2A00', u'\u2AFF'}, // Supplemental. + }, + }; + +bool coinToss(FuzzerGenerator& rng, double threshold); + +struct DataSpec { + bool includeNaN; + bool includeInfinity; +}; + enum class FuzzerTimestampPrecision : int8_t { kNanoSeconds = 0, kMicroSeconds = 1, @@ -30,9 +87,105 @@ enum class FuzzerTimestampPrecision : int8_t { kSeconds = 3, }; -namespace generator_spec_utils { +// Generate random values for the different supported types. +template +inline T rand(FuzzerGenerator& rng, DataSpec dataSpec = {false, false}) { + VELOX_NYI(); +} -bool coinToss(FuzzerGenerator& rng, double threshold); +template <> +inline int8_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { + return boost::random::uniform_int_distribution()(rng); +} + +template <> +inline int16_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { + return boost::random::uniform_int_distribution()(rng); +} + +template <> +inline int32_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { + return boost::random::uniform_int_distribution()(rng); +} + +template <> +inline int64_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { + return boost::random::uniform_int_distribution()(rng); +} + +template <> +inline double rand(FuzzerGenerator& rng, DataSpec dataSpec) { + if (dataSpec.includeNaN && coinToss(rng, 0.05)) { + return std::nan(""); + } + + if (dataSpec.includeInfinity && coinToss(rng, 0.05)) { + return std::numeric_limits::infinity(); + } + + return boost::random::uniform_01()(rng); +} + +template <> +inline float rand(FuzzerGenerator& rng, DataSpec dataSpec) { + if (dataSpec.includeNaN && coinToss(rng, 0.05)) { + return std::nanf(""); + } + + if (dataSpec.includeInfinity && 
coinToss(rng, 0.05)) { + return std::numeric_limits::infinity(); + } + + return boost::random::uniform_01()(rng); +} + +template <> +inline bool rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { + return boost::random::uniform_int_distribution(0, 1)(rng); +} + +template <> +inline uint32_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { + return boost::random::uniform_int_distribution()(rng); +} + +template <> +inline uint64_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { + return boost::random::uniform_int_distribution()(rng); +} + +template <> +inline int128_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { + return HugeInt::build(rand(rng), rand(rng)); +} + +Timestamp randTimestamp( + FuzzerGenerator& rng, + FuzzerTimestampPrecision timestampPrecision); + +template <> +inline Timestamp rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { + // TODO: support other timestamp precisions. + return randTimestamp(rng, FuzzerTimestampPrecision::kMicroSeconds); +} + +int32_t randDate(FuzzerGenerator& rng); + +template , int> = 0> +inline T rand(FuzzerGenerator& rng, T min, T max) { + return boost::random::uniform_int_distribution(min, max)(rng); +} + +/// Generates a random string in buf with characters of encodings. Return buf at +/// the end. 
+std::string randString( + FuzzerGenerator& rng, + size_t length, + const std::vector& encodings, + std::string& buf, + std::wstring_convert, char16_t>& converter); + +namespace generator_spec_utils { vector_size_t getRandomIndex(FuzzerGenerator& rng, vector_size_t maxIndex); diff --git a/velox/vector/fuzzer/VectorFuzzer.cpp b/velox/vector/fuzzer/VectorFuzzer.cpp index 9797bc56632a..8f89eb2a09a7 100644 --- a/velox/vector/fuzzer/VectorFuzzer.cpp +++ b/velox/vector/fuzzer/VectorFuzzer.cpp @@ -27,15 +27,12 @@ #include "velox/vector/FlatVector.h" #include "velox/vector/NullsBuilder.h" #include "velox/vector/VectorTypeUtils.h" +#include "velox/vector/fuzzer/Utils.h" namespace facebook::velox { namespace { -// DWRF requires nano to be in a certain range. Hardcode the value here to avoid -// the dependency on DWRF. -constexpr int64_t MAX_NANOS = 1'000'000'000; - // Structure to help temporary changes to Options. This objects saves the // current state of the Options object, and restores it when it's destructed. // For instance, if you would like to temporarily disable nulls for a particular @@ -62,116 +59,6 @@ struct ScopedOptions { VectorFuzzer::Options savedOpts; }; -// Generate random values for the different supported types. 
-template -T rand(FuzzerGenerator& rng, DataSpec dataSpec = {false, false}) { - VELOX_NYI(); -} - -template <> -int8_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { - return boost::random::uniform_int_distribution()(rng); -} - -template <> -int16_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { - return boost::random::uniform_int_distribution()(rng); -} - -template <> -int32_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { - return boost::random::uniform_int_distribution()(rng); -} - -template <> -int64_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { - return boost::random::uniform_int_distribution()(rng); -} - -template <> -double rand(FuzzerGenerator& rng, DataSpec dataSpec) { - if (dataSpec.includeNaN && coinToss(rng, 0.05)) { - return std::nan(""); - } - - if (dataSpec.includeInfinity && coinToss(rng, 0.05)) { - return std::numeric_limits::infinity(); - } - - return boost::random::uniform_01()(rng); -} - -template <> -float rand(FuzzerGenerator& rng, DataSpec dataSpec) { - if (dataSpec.includeNaN && coinToss(rng, 0.05)) { - return std::nanf(""); - } - - if (dataSpec.includeInfinity && coinToss(rng, 0.05)) { - return std::numeric_limits::infinity(); - } - - return boost::random::uniform_01()(rng); -} - -template <> -bool rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { - return boost::random::uniform_int_distribution(0, 1)(rng); -} - -template <> -uint32_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { - return boost::random::uniform_int_distribution()(rng); -} - -template <> -uint64_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { - return boost::random::uniform_int_distribution()(rng); -} - -template <> -int128_t rand(FuzzerGenerator& rng, DataSpec /*dataSpec*/) { - return HugeInt::build(rand(rng), rand(rng)); -} - -template , int> = 0> -T rand(FuzzerGenerator& rng, T min, T max) { - return boost::random::uniform_int_distribution(min, max)(rng); -} - -Timestamp randTimestamp(FuzzerGenerator& rng, VectorFuzzer::Options opts) { 
- // Generate timestamps only in the valid range to avoid datetime functions, - // such as try_cast(varchar as timestamp), throwing VeloxRuntimeError in - // fuzzers. - constexpr int64_t min = -2'140'671'600; - constexpr int64_t max = 2'140'671'600; - constexpr int64_t microInSecond = 1'000'000; - constexpr int64_t millisInSecond = 1'000; - - switch (opts.timestampPrecision) { - case FuzzerTimestampPrecision::kNanoSeconds: - return Timestamp( - rand(rng, min, max), (rand(rng) % MAX_NANOS)); - case FuzzerTimestampPrecision::kMicroSeconds: - return Timestamp::fromMicros( - rand(rng, min, max) * microInSecond + - rand(rng, -microInSecond, microInSecond)); - case FuzzerTimestampPrecision::kMilliSeconds: - return Timestamp::fromMillis( - rand(rng, min, max) * millisInSecond + - rand(rng, -millisInSecond, millisInSecond)); - case FuzzerTimestampPrecision::kSeconds: - return Timestamp(rand(rng, min, max), 0); - } - return {}; // no-op. -} - -int32_t randDate(FuzzerGenerator& rng) { - constexpr int64_t min = -24'450; - constexpr int64_t max = 24'450; - return rand(rng, min, max); -} - size_t getElementsVectorLength( const VectorFuzzer::Options& opts, vector_size_t size) { @@ -196,43 +83,7 @@ int128_t randLongDecimal(const TypePtr& type, FuzzerGenerator& rng) { return rand(rng) % DecimalUtil::kPowersOfTen[precision]; } -/// Unicode character ranges. Ensure the vector indexes match the UTF8CharList -/// enum values. -/// -/// Source: https://jrgraphix.net/research/unicode_blocks.php -const std::vector>> kUTFChatSets{ - // UTF8CharList::ASCII - { - {33, 127}, // All ASCII printable chars. - }, - // UTF8CharList::UNICODE_CASE_SENSITIVE - { - {u'\u0020', u'\u007F'}, // Basic Latin. - {u'\u0400', u'\u04FF'}, // Cyrillic. - }, - // UTF8CharList::EXTENDED_UNICODE - { - {u'\u03F0', u'\u03FF'}, // Greek. - {u'\u0100', u'\u017F'}, // Latin Extended A. - {u'\u0600', u'\u06FF'}, // Arabic. - {u'\u0900', u'\u097F'}, // Devanagari. - {u'\u0600', u'\u06FF'}, // Hebrew. 
- {u'\u3040', u'\u309F'}, // Hiragana. - {u'\u2000', u'\u206F'}, // Punctuation. - {u'\u2070', u'\u209F'}, // Sub/Super Script. - {u'\u20A0', u'\u20CF'}, // Currency. - }, - // UTF8CharList::MATHEMATICAL_SYMBOLS - { - {u'\u2200', u'\u22FF'}, // Math Operators. - {u'\u2150', u'\u218F'}, // Number Forms. - {u'\u25A0', u'\u25FF'}, // Geometric Shapes. - {u'\u27C0', u'\u27EF'}, // Math Symbols. - {u'\u2A00', u'\u2AFF'}, // Supplemental. - }, -}; - -FOLLY_ALWAYS_INLINE char16_t getRandomChar( +/*FOLLY_ALWAYS_INLINE char16_t getRandomChar( FuzzerGenerator& rng, const std::vector>& charSet) { const auto& chars = charSet.size() == 1 @@ -242,7 +93,7 @@ FOLLY_ALWAYS_INLINE char16_t getRandomChar( auto inc = (rand(rng) % size); char16_t res = chars.first + inc; return res; -} +}*/ /// Generates a random string (string size and encoding are passed through /// Options). Returns a StringView which uses `buf` as the underlying buffer. @@ -251,24 +102,11 @@ StringView randString( const VectorFuzzer::Options& opts, std::string& buf, std::wstring_convert, char16_t>& converter) { - buf.clear(); - std::u16string wbuf; const size_t stringLength = opts.stringVariableLength ? rand(rng) % opts.stringLength : opts.stringLength; - wbuf.resize(stringLength); - - for (size_t i = 0; i < stringLength; ++i) { - // First choose a random encoding from the list of input acceptable - // encodings. - const auto& encoding = (opts.charEncodings.size() == 1) - ? 
opts.charEncodings.front() - : opts.charEncodings[rand(rng) % opts.charEncodings.size()]; - - wbuf[i] = getRandomChar(rng, kUTFChatSets[encoding]); - } - buf.append(converter.to_bytes(wbuf)); + randString(rng, stringLength, opts.charEncodings, buf, converter); return StringView(buf); } @@ -290,7 +128,7 @@ VectorPtr fuzzConstantPrimitiveImpl( } if constexpr (std::is_same_v) { return std::make_shared>( - pool, size, false, type, randTimestamp(rng, opts)); + pool, size, false, type, randTimestamp(rng, opts.timestampPrecision)); } else if (type->isDate()) { return std::make_shared>( pool, size, false, type, randDate(rng)); @@ -322,7 +160,7 @@ void fuzzFlatPrimitiveImpl( if constexpr (std::is_same_v) { flatVector->set(i, randString(rng, opts, strBuf, converter)); } else if constexpr (std::is_same_v) { - flatVector->set(i, randTimestamp(rng, opts)); + flatVector->set(i, randTimestamp(rng, opts.timestampPrecision)); } else if constexpr (std::is_same_v) { if (vector->type()->isShortDecimal()) { flatVector->set(i, randShortDecimal(vector->type(), rng)); diff --git a/velox/vector/fuzzer/VectorFuzzer.h b/velox/vector/fuzzer/VectorFuzzer.h index 8ab96c7cb433..4d4712f0261b 100644 --- a/velox/vector/fuzzer/VectorFuzzer.h +++ b/velox/vector/fuzzer/VectorFuzzer.h @@ -27,18 +27,6 @@ namespace facebook::velox { -enum UTF8CharList { - ASCII = 0, // Ascii character set. - UNICODE_CASE_SENSITIVE = 1, // Unicode scripts that support case. - EXTENDED_UNICODE = 2, // Extended Unicode: Arabic, Devanagiri etc - MATHEMATICAL_SYMBOLS = 3 // Mathematical Symbols. -}; - -struct DataSpec { - bool includeNaN; - bool includeInfinity; -}; - const std::vector& defaultScalarTypes(); /// VectorFuzzer is a helper class that generates randomized vectors and their