diff --git a/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.cpp b/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.cpp new file mode 100644 index 0000000000000..736003a3d2847 --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.cpp @@ -0,0 +1,221 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h" + +#include + +#include "velox/vector/fuzzer/Utils.h" + +namespace facebook::velox::fuzzer { +/** Returns one random char16_t taken from charSet. charSet is a list of (first, second) ranges: when it holds exactly one range that range is used, otherwise a range is picked with a random index modulo charSet.size(). The offset inside the range is a random draw modulo (second - first), so second itself is never returned. NOTE(review): an empty charSet or a range with first == second would make one of the moduli zero; the kUTFChatSets tables passed by randString contain neither, but confirm for any other caller. */ +FOLLY_ALWAYS_INLINE char16_t getRandomChar( + FuzzerGenerator& rng, + const std::vector>& charSet) { + const auto& chars = charSet.size() == 1 + ? charSet.front() + : charSet[rand(rng) % charSet.size()]; + auto size = chars.second - chars.first; + auto inc = (rand(rng) % size); + char16_t res = chars.first + inc; + return res; +} + +/// Generates a random string of `length` char16_t units drawn from the +/// `encoding` character set and returns it converted to bytes by `converter`. 
+std::string randString( + FuzzerGenerator& rng, + size_t length, + UTF8CharList encoding, + std::wstring_convert, char16_t>& converter) { + std::string buf; + std::u16string wbuf; + wbuf.resize(length); + + for (size_t i = 0; i < length; ++i) { + wbuf[i] = getRandomChar(rng, kUTFChatSets[encoding]); + } + buf.append(converter.to_bytes(wbuf)); + return buf; +} +/** Base-class constructor: stores type and next, then seeds rng_ with seed. */ +// AbstractInputGenerator +AbstractInputGenerator::AbstractInputGenerator( + size_t seed, + const TypePtr& type, + std::unique_ptr&& next) + : type_{type}, next_{std::move(next)} { + rng_.seed(seed); +} +/** Rejection sampling: keeps drawing from next_ until the value differs from excludedValue_. NOTE(review): there is no iteration cap, so this never returns if next_ can only produce excludedValue_. */ +// NotEqualConstrainedGenerator +variant NotEqualConstrainedGenerator::generate() { + variant value; + do { + value = next_->generate(); + } while (value == excludedValue_); + return value; +} +/** Returns the element of set_ at an index drawn from [0, set_.size() - 1]. NOTE(review): with an empty set_ the expression set_.size() - 1 wraps around and set_[index] reads out of bounds; callers must pass a non-empty set. */ +// SetConstrainedGenerator +variant SetConstrainedGenerator::generate() { + const auto index = + boost::random::uniform_int_distribution(0, set_.size() - 1)(rng_); + return set_[index]; +} +/** Builds the folly serialization options. allow_non_string_keys and allow_nan_inf are always enabled; the remaining flags are randomized only when makeRandomVariation_ is true. The JsonInputGenerator constructor calls this once and stores the result in opts_. */ +// JsonInputGenerator +folly::json::serialization_opts JsonInputGenerator::getSerializationOptions() { + folly::json::serialization_opts opts; + opts.allow_non_string_keys = true; + opts.allow_nan_inf = true; + if (makeRandomVariation_) { + opts.convert_int_keys = rand(rng_); + opts.pretty_formatting = rand(rng_); + opts.pretty_formatting_indent_width = rand(rng_, 0, 4); + opts.encode_non_ascii = rand(rng_); + opts.allow_trailing_comma = rand(rng_); + opts.sort_keys = rand(rng_); + opts.skip_invalid_utf8 = rand(rng_); + opts.parse_numbers_as_strings = rand(rng_); + } + return opts; +} +/** Generates one object with objectGenerator_, converts it to folly::dynamic, serializes it with opts_ and returns the string wrapped in a variant. NOTE(review): makeRandomVariation takes its argument by value and returns void, and jsonString is const here, so the variation is applied to a temporary copy and the returned string is always the unmodified serialization; confirm whether that is intended. */ +variant JsonInputGenerator::generate() { + const auto object = objectGenerator_->generate(); + const folly::dynamic jsonObject = convertVariantToDynamic(object); + const auto jsonString = folly::json::serialize(jsonObject, opts_); + if (makeRandomVariation_ && coinToss(rng_, 0.5)) { + makeRandomVariation(jsonString); + } + return variant(jsonString); +} +/** Recursively converts a variant into folly::dynamic. A null variant yields a default-constructed dynamic; scalar kinds go through convertVariantToDynamicPrimitive; ARRAY and ROW both become dynamic arrays (row field names are not emitted); MAP becomes a dynamic object keyed by the converted keys. Any other kind reaches VELOX_UNREACHABLE. */ +folly::dynamic JsonInputGenerator::convertVariantToDynamic( + const variant& object) 
{ + if (object.isNull()) { + return folly::dynamic(); + } + + switch (object.kind()) { + case TypeKind::BOOLEAN: + return convertVariantToDynamicPrimitive(object); + case TypeKind::TINYINT: + return convertVariantToDynamicPrimitive(object); + case TypeKind::SMALLINT: + return convertVariantToDynamicPrimitive(object); + case TypeKind::INTEGER: + return convertVariantToDynamicPrimitive(object); + case TypeKind::BIGINT: + return convertVariantToDynamicPrimitive(object); + case TypeKind::REAL: + return convertVariantToDynamicPrimitive(object); + case TypeKind::DOUBLE: + return convertVariantToDynamicPrimitive(object); + case TypeKind::VARCHAR: + return convertVariantToDynamicPrimitive(object); + case TypeKind::VARBINARY: + return convertVariantToDynamicPrimitive(object); + case TypeKind::TIMESTAMP: + return convertVariantToDynamicPrimitive(object); + case TypeKind::HUGEINT: + return convertVariantToDynamicPrimitive(object); + case TypeKind::ARRAY: { + folly::dynamic array = folly::dynamic::array; + for (const auto& element : object.value()) { + array.push_back(convertVariantToDynamic(element)); + } + return array; + } + case TypeKind::MAP: { + folly::dynamic map = folly::dynamic::object; + for (const auto& [key, value] : object.value()) { + map[convertVariantToDynamic(key)] = convertVariantToDynamic(value); + } + return map; + } + case TypeKind::ROW: { + folly::dynamic array = folly::dynamic::array; + for (const auto& element : object.value()) { + array.push_back(convertVariantToDynamic(element)); + } + return array; + } + default: + VELOX_UNREACHABLE("Unsupported type"); + } +} +/** Returns, by value, a copy of a function-local static list of control-character literals: code points 0x00 through 0x20, 0x7F, and U+0080 through U+009F. NOTE(review): the element type is not visible in this view; if it is std::string, the first literal (a lone NUL) presumably constructs an empty string, so inserting it would be a no-op; confirm. */ +std::vector getControlCharacters() { + static std::vector controlCharacters = { + "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", + "\x07", "\x08", "\x09", "\x0A", "\x0B", "\x0C", "\x0D", + "\x0E", "\x0F", "\x10", "\x11", "\x12", "\x13", "\x14", + "\x15", "\x16", "\x17", "\x18", "\x19", "\x1A", "\x1B", + "\x1C", "\x1D", "\x1E", "\x1F", "\x20", "\x7F", "\u0080", + "\u0081", "\u0082", 
"\u0083", "\u0084", "\u0085", "\u0086", "\u0087", + "\u0088", "\u0089", "\u008A", "\u008B", "\u008C", "\u008D", "\u008E", + "\u008F", "\u0090", "\u0091", "\u0092", "\u0093", "\u0094", "\u0095", + "\u0096", "\u0097", "\u0098", "\u0099", "\u009A", "\u009B", "\u009C", + "\u009D", "\u009E", "\u009F"}; + return controlCharacters; +}; +/** Mutates json to make it (possibly) malformed: with probability 0.1 inserts one random control character at a random position in [0, json.size()]; otherwise, with probability 0.1, truncates json to a random length in [0, json.size()]. NOTE(review): json is a by-value parameter and the function returns void, so the caller never observes the change. */ +void JsonInputGenerator::makeRandomVariation(std::string json) { + if (coinToss(rng_, 0.1)) { + const auto controlCharacters = getControlCharacters(); + const auto index = rand(rng_, 0, controlCharacters.size() - 1); + const auto controlCharacter = controlCharacters[index]; + const auto indexToInsert = rand(rng_, 0, json.size()); + json.insert(indexToInsert, controlCharacter); + } else if (coinToss(rng_, 0.1)) { + const auto size = rand(rng_, 0, json.size()); + json.resize(size); + } +} +/** Helper used by the scalar type dispatch in getRandomInputGenerator: creates a generator from (seed, type) for the native type T of the dispatched kind. The concrete generator class is not visible in this view (template arguments stripped); presumably RandomInputGenerator of T. */ +// Utility functions +template +std::unique_ptr getRandomInputGeneratorPrimitive( + size_t seed, + const TypePtr& type) { + using T = typename TypeTraits::NativeType; + std::unique_ptr generator = + std::make_unique>(seed, type); + return generator; +} +/** Returns a default generator for type. Primitive types dispatch on type->kind(); array, map and row types each construct a generator from (seed, type), the row one with an empty list of field generators. Returns a null pointer when type is none of these. */ +std::unique_ptr getRandomInputGenerator( + size_t seed, + const TypePtr& type) { + std::unique_ptr generator; + if (type->isPrimitiveType()) { + return VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH( + getRandomInputGeneratorPrimitive, false, type->kind(), seed, type); + } else if (type->isArray()) { + generator = std::make_unique>(seed, type); + } else if (type->isMap()) { + generator = std::make_unique>(seed, type); + + } else if (type->isRow()) { + generator = std::make_unique>( + seed, type, std::vector>{}); + } + return generator; +} + +} // namespace facebook::velox::fuzzer diff --git a/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h b/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h new file mode 100644 index 0000000000000..9bb286a0949d3 --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h @@ -0,0 +1,417 @@ +/* + * Copyright (c) 
Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include "folly/json.h" + +#include "velox/type/Type.h" +#include "velox/type/Variant.h" +#include "velox/vector/fuzzer/Utils.h" + +namespace facebook::velox::fuzzer { + +using facebook::velox::variant; +/** Kinds of constraints that can be placed on generated values. NOTE(review): none of these enumerators is referenced by the code visible in this diff; presumably reserved for callers elsewhere; confirm. */ +enum class ConstraintType { + kEqual, + kNotEqual, + kRange, + kSet, + kAlphabet, + kLength, + kElement, + kContain, + kSum, +}; +/** Base class of all input generators. Each call to generate() yields one value as a variant. The constructor (defined in ConstrainedGenerators.cpp) seeds rng_ with seed and takes ownership of next, an optional upstream generator that constraint generators such as NotEqualConstrainedGenerator draw from. */ +class AbstractInputGenerator { + public: + AbstractInputGenerator( + size_t seed, + const TypePtr& type, + std::unique_ptr&& next); + + virtual ~AbstractInputGenerator() = default; +/** Produces the next generated value. */ + virtual variant generate() = 0; +/** Returns the type passed at construction. */ + TypePtr type() const { + return type_; + } + + protected: + FuzzerGenerator rng_; +/** Type of the values this generator produces. */ + TypePtr type_; +/** Upstream generator to delegate to; null for generators constructed without one. */ + std::unique_ptr next_; +}; + +namespace { + +// Generate random values for the different supported types. 
/** Primary template: types without a specialization below fail with VELOX_NYI. */+template +T rand(FuzzerGenerator& rng) { + VELOX_NYI(); +} +/** Integer specializations: each draws from a default-constructed boost uniform_int_distribution. NOTE(review): per the Boost documentation the default range is [0, max], so these presumably never return negative values; confirm that this is intended. */ +template <> +int8_t rand(FuzzerGenerator& rng) { + return boost::random::uniform_int_distribution()(rng); +} + +template <> +int16_t rand(FuzzerGenerator& rng) { + return boost::random::uniform_int_distribution()(rng); +} + +template <> +int32_t rand(FuzzerGenerator& rng) { + return boost::random::uniform_int_distribution()(rng); +} + +template <> +int64_t rand(FuzzerGenerator& rng) { + return boost::random::uniform_int_distribution()(rng); +} + +template <> +uint32_t rand(FuzzerGenerator& rng) { + return boost::random::uniform_int_distribution()(rng); +} +/** Returns NaN with probability 0.05; otherwise positive infinity with probability 0.05; otherwise a draw from boost uniform_01. Negative infinity and values outside the uniform_01 range are never produced. */ +template <> +double rand(FuzzerGenerator& rng) { + if (coinToss(rng, 0.05)) { + return std::nan(""); + } + + if (coinToss(rng, 0.05)) { + return std::numeric_limits::infinity(); + } + + return boost::random::uniform_01()(rng); +} +/** Float counterpart of the double specialization, with the same probabilities. */ +template <> +float rand(FuzzerGenerator& rng) { + if (coinToss(rng, 0.05)) { + return std::nanf(""); + } + + if (coinToss(rng, 0.05)) { + return std::numeric_limits::infinity(); + } + + return boost::random::uniform_01()(rng); +} +/** Draws an integer from [0, 1] and converts it to bool. */ +template <> +bool rand(FuzzerGenerator& rng) { + return boost::random::uniform_int_distribution(0, 1)(rng); +} +/** Builds a 128-bit value with HugeInt::build from two independent random draws. */ +template <> +int128_t rand(FuzzerGenerator& rng) { + return HugeInt::build(rand(rng), rand(rng)); +} +/** Bounded overload: returns an integer drawn from uniform_int_distribution(min, max). The enabling condition on T is not visible in this view. */ +template , int> = 0> +T rand(FuzzerGenerator& rng, T min, T max) { + return boost::random::uniform_int_distribution(min, max)(rng); +} + +} // namespace + +/// Generates a random string of `length` char16_t units drawn from the +/// `encoding` character set and returns it converted to bytes by `converter`. 
+std::string randString( + FuzzerGenerator& rng, + size_t length, + UTF8CharList encoding, + std::wstring_convert, char16_t>& converter); +/** Returns a default random generator for type, or a null pointer when type is not primitive, array, map or row. Defined in ConstrainedGenerators.cpp. */ +std::unique_ptr getRandomInputGenerator( + size_t seed, + const TypePtr& type); +/** Default generator: generate() wraps a single rand(rng_) draw in a variant. Has no upstream generator. */ +template +class RandomInputGenerator : public AbstractInputGenerator { + public: + RandomInputGenerator(size_t seed, const TypePtr& type) + : AbstractInputGenerator(seed, type, nullptr) {} + + ~RandomInputGenerator() override = default; + + variant generate() override { + return variant(rand(rng_)); + } +}; +/** String specialization (the selecting condition is not visible in this view). generate() returns a string whose length is drawn from [0, kDefaultMaxLength] using characters of encoding_, which defaults to ASCII. The two-argument overload generates a string of an explicit length and encoding instead. */ +template +class RandomInputGenerator>> + : public AbstractInputGenerator { + public: + RandomInputGenerator>>( + size_t seed, + const TypePtr& type) + : AbstractInputGenerator(seed, type, nullptr) {} + + ~RandomInputGenerator>>() + override = default; + + variant generate() override { + const auto length = rand(rng_, 0, kDefaultMaxLength); + std::wstring_convert, char16_t> converter; + return variant(randString(rng_, length, encoding_, converter)); + } + + variant generate(size_t length, UTF8CharList encoding) { + std::wstring_convert, char16_t> converter; + return variant(randString(rng_, length, encoding, converter)); + } + + private: + const size_t kDefaultMaxLength = 4000; + + UTF8CharList encoding_ = UTF8CharList::ASCII; +}; +/** Array specialization. generate() draws a length from [0, maxLength_] and fills the array from elementGenerator_, which defaults to getRandomInputGenerator for the child type with the same seed. When containAtIndex_ is set and below the drawn length, the element at that index comes from containGenerator_ instead. NOTE(review): containGenerator_ defaults to a null pointer and is dereferenced whenever containAtIndex_ is hit, so callers must supply both together. */ +template +class RandomInputGenerator>> + : public AbstractInputGenerator { + public: + RandomInputGenerator>>( + size_t seed, + const TypePtr& type, + size_t maxLength = 10, + std::unique_ptr&& elementGenerator = nullptr, + std::optional containAtIndex = std::nullopt, + std::unique_ptr&& containGenerator = nullptr) + : AbstractInputGenerator(seed, type, nullptr), + maxLength_{maxLength}, + elementGenerator_{ + elementGenerator ? 
std::move(elementGenerator) + : getRandomInputGenerator(seed, type->childAt(0))}, + containAtIndex_{containAtIndex}, + containGenerator_{std::move(containGenerator)} {} + + ~RandomInputGenerator>>() + override = default; + + variant generate() override { + const auto length = rand(rng_, 0, maxLength_); + std::vector elements; + elements.reserve(length); + for (size_t i = 0; i < length; ++i) { + if UNLIKELY (containAtIndex_.has_value() && *containAtIndex_ == i) { + elements.push_back(containGenerator_->generate()); + } else { + elements.push_back(elementGenerator_->generate()); + } + } + return variant::array(elements); + } + + private: + const size_t maxLength_; + + std::unique_ptr elementGenerator_; + + std::optional containAtIndex_; + + std::unique_ptr containGenerator_; +}; +/** Map specialization. generate() draws a length from [0, maxLength_] and emplaces that many key/value pairs into a std::map, so pairs whose key was already inserted are dropped and the result may hold fewer entries than the drawn length. Key and value generators default to getRandomInputGenerator for the respective child type with the same seed. NOTE(review): containKeyGenerator_ and containValueGenerator_ default to null pointers and are dereferenced whenever containAtIndex_ is hit. */ +template +class RandomInputGenerator>> + : public AbstractInputGenerator { + public: + RandomInputGenerator>>( + size_t seed, + const TypePtr& type, + size_t maxLength = 10, + std::unique_ptr&& keyGenerator = nullptr, + std::unique_ptr&& valueGenerator = nullptr, + std::optional containAtIndex = std::nullopt, + std::unique_ptr&& containKeyGenerator = nullptr, + std::unique_ptr&& containValueGenerator = nullptr) + : AbstractInputGenerator(seed, type, nullptr), + maxLength_{maxLength}, + keyGenerator_{ + keyGenerator ? std::move(keyGenerator) + : getRandomInputGenerator(seed, type->childAt(0))}, + valueGenerator_{ + valueGenerator ? 
std::move(valueGenerator) + : getRandomInputGenerator(seed, type->childAt(1))}, + containAtIndex_{containAtIndex}, + containKeyGenerator_{std::move(containKeyGenerator)}, + containValueGenerator_{std::move(containValueGenerator)} {} + + ~RandomInputGenerator>>() + override = default; + + variant generate() override { + const auto length = rand(rng_, 0, maxLength_); + std::map map; + for (size_t i = 0; i < length; ++i) { + if UNLIKELY (containAtIndex_.has_value() && *containAtIndex_ == i) { + map.emplace( + containKeyGenerator_->generate(), + containValueGenerator_->generate()); + } else { + map.emplace(keyGenerator_->generate(), valueGenerator_->generate()); + } + } + return variant::map(map); + } + + private: + const size_t maxLength_; + + std::unique_ptr keyGenerator_; + + std::unique_ptr valueGenerator_; + + std::optional containAtIndex_; + + std::unique_ptr containKeyGenerator_; + + std::unique_ptr containValueGenerator_; +}; +/** Row specialization. The constructor takes one generator per field; entries that are missing (list shorter than type->size()) or null are replaced by getRandomInputGenerator for that child type with the same seed. generate() produces one value per field, in order, and returns them as a row variant. */ +template +class RandomInputGenerator>> + : public AbstractInputGenerator { + public: + RandomInputGenerator>>( + size_t seed, + const TypePtr& type, + std::vector> fieldGenerators) + : AbstractInputGenerator(seed, type, nullptr) { + const auto length = type->size(); + fieldGenerators_ = std::move(fieldGenerators); + for (size_t i = 0; i < length; ++i) { + if (fieldGenerators_.size() <= i) { + fieldGenerators_.push_back( + getRandomInputGenerator(seed, type->childAt(i))); + } else if (fieldGenerators_[i] == nullptr) { + fieldGenerators_[i] = getRandomInputGenerator(seed, type->childAt(i)); + } + } + } + + ~RandomInputGenerator>>() + override = default; + + variant generate() override { + const auto length = type_->size(); + std::vector fields; + fields.reserve(length); + for (size_t i = 0; i < length; ++i) { + fields.push_back(fieldGenerators_[i]->generate()); + } + return variant::row(fields); + } + + private: + std::vector> fieldGenerators_; +}; +/** Constraint generator that wraps next and only returns values different from excludedValue; generate() is defined in ConstrainedGenerators.cpp and retries until a different value is drawn. */ +class NotEqualConstrainedGenerator : public AbstractInputGenerator { + public: + 
NotEqualConstrainedGenerator( + size_t seed, + const TypePtr& type, + const variant& excludedValue, + std::unique_ptr&& next) + : AbstractInputGenerator(seed, type, std::move(next)), + excludedValue_{excludedValue} {} + + ~NotEqualConstrainedGenerator() override = default; + + variant generate() override; + + private: + variant excludedValue_; +}; +/** Constraint generator that returns values picked at random from a fixed list copied at construction. The list must not be empty: generate() indexes it with a draw from [0, size - 1]. */ +class SetConstrainedGenerator : public AbstractInputGenerator { + public: + SetConstrainedGenerator( + size_t seed, + const TypePtr& type, + const std::vector& set) + : AbstractInputGenerator(seed, type, nullptr), set_{set} {} + + ~SetConstrainedGenerator() override = default; + + variant generate() override; + + private: + std::vector set_; +}; +/** Generates JSON strings by serializing values produced by objectGenerator with folly. Serialization options are computed once in the constructor via getSerializationOptions(); because opts_ is declared after makeRandomVariation_, the flag is already initialized when that call runs. serializationOptions() exposes the options so that consumers can parse the output with matching settings. */ +class JsonInputGenerator : public AbstractInputGenerator { + public: + JsonInputGenerator( + size_t seed, + const TypePtr& type, + std::unique_ptr&& objectGenerator, + bool makeRandomVariation = false) + : AbstractInputGenerator(seed, type, nullptr), + objectGenerator_{std::move(objectGenerator)}, + makeRandomVariation_{makeRandomVariation}, + opts_{getSerializationOptions()} {} + + ~JsonInputGenerator() override = default; + + variant generate() override; + + const folly::json::serialization_opts& serializationOptions() const { + return opts_; + } + + private: + template + folly::dynamic convertVariantToDynamicPrimitive(const variant& v) { + using T = typename TypeTraits::DeepCopiedType; + VELOX_CHECK(v.isSet()); + const T value = v.value(); + return folly::dynamic(value); + } + + folly::dynamic convertVariantToDynamic(const variant& object); +/** NOTE(review): takes json by value and returns void, so callers cannot observe the variation. */ + void makeRandomVariation(std::string json); + + folly::json::serialization_opts getSerializationOptions(); + + std::unique_ptr objectGenerator_; + + bool makeRandomVariation_; + + folly::json::serialization_opts opts_; +}; + +} // namespace facebook::velox::fuzzer diff --git a/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.cpp b/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.cpp new file 
mode 100644 index 0000000000000..20797769ea766 --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h" + +#include "velox/expression/VectorWriters.h" + +namespace facebook::velox::fuzzer { + +using exec::GenericWriter; +using exec::VectorWriter; +/** Draws a single value from customGenerator and returns a constant vector of size rows holding it, created with BaseVector::createConstant from pool. Checks that customGenerator is not null and that its type is primitive. */ +// static +VectorPtr ConstrainedVectorGenerator::generateConstant( + const std::shared_ptr& customGenerator, + vector_size_t size, + memory::MemoryPool* pool) { + VELOX_CHECK_NOT_NULL(customGenerator); + VELOX_CHECK(customGenerator->type()->isPrimitiveType()); + + const auto& type = customGenerator->type(); + const auto variant = customGenerator->generate(); + + return BaseVector::createConstant(type, variant, size, pool); +} +/** writeOne copies one non-null variant into a GenericWriter. The primary template and its specializations are declared up front, presumably because the complex-type specializations dispatch back into writeOne recursively and need every specialization declared before first use (the template arguments are not visible in this view). */ +template +void writeOne(const variant& value, GenericWriter& writer); +template <> +void writeOne(const variant& value, GenericWriter& writer); +template <> +void writeOne(const variant& value, GenericWriter& writer); +template <> +void writeOne(const variant& value, GenericWriter& writer); +template <> +void writeOne(const variant& value, GenericWriter& writer); +template <> +void writeOne(const variant& value, GenericWriter& writer); + +template +void writeOne(const variant& value, GenericWriter& writer) { + using T = typename 
TypeTraits::NativeType; + writer.template castTo() = value.value(); +} + +template <> +void writeOne(const variant& value, GenericWriter& writer) { + writer.template castTo() = value.value(); +} + +template <> +void writeOne( + const variant& value, + GenericWriter& writer) { + writer.template castTo() = value.value(); +} +/** Array case: appends every element of value.array() to the array writer, writing a null for null elements and dispatching on the element kind otherwise. */ +template <> +void writeOne(const variant& value, GenericWriter& writer) { + auto& writerTyped = writer.template castTo>(); + const auto& elements = value.array(); + for (const auto& element : elements) { + if (element.isNull()) { + writerTyped.add_null(); + } else { + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, element.kind(), element, writerTyped.add_item()); + } + } +} +/** Map case: writes every entry of value.map(). Keys must not be null (checked). For a null mapped value only the key is written, into the writer returned by add_null(); otherwise key and value are written through the pair of writers returned by add_item(). */ +template <> +void writeOne(const variant& value, GenericWriter& writer) { + auto& writerTyped = writer.template castTo>(); + const auto& map = value.map(); + for (const auto& pair : map) { + const auto& key = pair.first; + const auto& value = pair.second; + VELOX_CHECK(!key.isNull()); + if (value.isNull()) { + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, key.kind(), key, writerTyped.add_null()); + } else { + auto writers = writerTyped.add_item(); + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, key.kind(), key, std::get<0>(writers)); + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, value.kind(), value, std::get<1>(writers)); + } + } +} +/** Row case: writes the fields of value.row() positionally; field i is set to null when its variant is null, otherwise written through get_writer_at(i). */ +template <> +void writeOne(const variant& value, GenericWriter& writer) { + auto& writerTyped = writer.template castTo(); + const auto& elements = value.row(); + column_index_t i = 0; + for (const auto& element : elements) { + if (element.isNull()) { + writerTyped.set_null_at(i); + } else { + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, element.kind(), element, writerTyped.get_writer_at(i)); + } + i++; + } +} +/** Returns a vector of size rows of the generator type, allocated from pool, where every row is written from its own customGenerator->generate() call; null variants are committed as null rows. Checks that customGenerator is not null. */ +// static +VectorPtr ConstrainedVectorGenerator::generateFlat( + const std::shared_ptr& customGenerator, + vector_size_t size, + memory::MemoryPool* pool) { + VELOX_CHECK_NOT_NULL(customGenerator); + + VectorPtr result; + const auto& type = 
customGenerator->type(); + BaseVector::ensureWritable(SelectivityVector(size), type, pool, result); + VectorWriter writer; + writer.init(*result); + + for (auto i = 0; i < size; ++i) { + writer.setOffset(i); + const auto variant = customGenerator->generate(); + if (variant.isNull()) { + writer.commitNull(); + } else { + VELOX_DYNAMIC_TYPE_DISPATCH( + writeOne, type->kind(), variant, writer.current()); + writer.commit(true); + } + } + return result; +} + +} // namespace facebook::velox::fuzzer diff --git a/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h b/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h new file mode 100644 index 0000000000000..555b905f46b0a --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h" + +namespace facebook::velox::fuzzer { +/** Static-only helper (the default constructor is deleted) that turns values produced by an input generator into Velox vectors. Both functions are defined in ConstrainedVectorGenerator.cpp and check that customGenerator is not null. */ +class ConstrainedVectorGenerator { + public: + ConstrainedVectorGenerator() = delete; +/** Draws one value from customGenerator and returns a constant vector of size rows holding that value, allocated from pool. The generator type must be primitive (checked in the definition). */ + static VectorPtr generateConstant( + const std::shared_ptr& customGenerator, + vector_size_t size, + memory::MemoryPool* pool); +/** Returns a vector of size rows, allocated from pool, in which each row is written from a separate customGenerator->generate() call; null values become null rows. */ + static VectorPtr generateFlat( + const std::shared_ptr& customGenerator, + vector_size_t size, + memory::MemoryPool* pool); +}; + +} // namespace facebook::velox::fuzzer diff --git a/velox/experimental/fuzzer_input_generator/tests/ConstrainedGeneratorsTest.cpp b/velox/experimental/fuzzer_input_generator/tests/ConstrainedGeneratorsTest.cpp new file mode 100644 index 0000000000000..037b76ba88586 --- /dev/null +++ b/velox/experimental/fuzzer_input_generator/tests/ConstrainedGeneratorsTest.cpp @@ -0,0 +1,260 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h" + +#include + +#include "velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h" +#include "velox/functions/prestosql/types/JsonType.h" +#include "velox/type/Variant.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +namespace facebook::velox::fuzzer::test { +/** Test fixture for the constrained input generators. Every helper builds its generators with seed 0, so runs are deterministic, and the constraint helpers check 1000 generated values each. */ +class ConstrainedGeneratorsTest : public testing::Test, + public velox::test::VectorTestBase { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } +/** Checks that a default generator for a primitive type yields a value that is set and has the expected kind. */ + template + void testRandomPrimitive(const TypePtr& type) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::NativeType; + + std::unique_ptr generator = + std::make_unique>(0, type); + auto value = generator->generate(); + EXPECT_TRUE(value.hasValue()); + EXPECT_EQ(value.kind(), KIND); + } +/** Same as testRandomPrimitive for complex types; only the top-level kind is verified. */ + template + void testRandomComplex(const TypePtr& type) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::ImplType; + + std::unique_ptr generator = + std::make_unique>(0, type); + auto value = generator->generate(); + EXPECT_TRUE(value.hasValue()); + EXPECT_EQ(value.kind(), KIND); // TODO: check type recursive + } +/** Wraps a default generator in NotEqualConstrainedGenerator and checks that no generated value equals excludedValue. */ + template + void testNotEqualPrimitive(const TypePtr& type, const auto& excludedValue) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::NativeType; + + variant excludedVariant{excludedValue}; + std::unique_ptr generator = + std::make_unique( + 0, + type, + excludedVariant, + std::make_unique>(0, type)); + + const uint32_t kIterations = 1000; + for (uint32_t i = 0; i < kIterations; ++i) { + auto value = generator->generate(); + EXPECT_TRUE(value.hasValue()); + EXPECT_EQ(value.kind(), KIND); + EXPECT_NE(value, excludedVariant); + } + } + + template + void testNotEqualComplex( + const TypePtr& type, + const variant& excludedVariant) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::ImplType; + + std::unique_ptr 
generator = + std::make_unique( + 0, + type, + excludedVariant, + std::make_unique>(0, type)); + + const uint32_t kIterations = 1000; + for (uint32_t i = 0; i < kIterations; ++i) { + auto value = generator->generate(); + EXPECT_TRUE(value.hasValue()); + EXPECT_EQ(value.kind(), KIND); // todo: check type recursive + EXPECT_NE(value, excludedVariant); + } + } +/** Builds a SetConstrainedGenerator from the raw values and checks that every generated value is a member of setOfRawValues. */ + template + void testSetPrimitive(const TypePtr& type, const auto& setOfRawValues) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::NativeType; + + const uint32_t kIterations = 1000; + std::vector variants; + for (const auto& value : setOfRawValues) { + variants.push_back(variant{value}); + } + std::unique_ptr generator = + std::make_unique(0, type, variants); + + for (uint32_t i = 0; i < kIterations; ++i) { + auto value = generator->generate(); + EXPECT_TRUE(value.hasValue()); + EXPECT_EQ(value.kind(), KIND); + EXPECT_NE(setOfRawValues.count(value.value()), 0); + } + } + + template + void testSetComplex( + const TypePtr& type, + const std::vector& variants) { + VELOX_CHECK_EQ(type->kind(), KIND); + using T = typename TypeTraits::ImplType; + + std::set setOfVariants{variants.begin(), variants.end()}; + + std::unique_ptr generator = + std::make_unique(0, type, variants); + + const uint32_t kIterations = 1000; + for (uint32_t i = 0; i < kIterations; ++i) { + auto value = generator->generate(); + EXPECT_TRUE(value.hasValue()); + EXPECT_EQ(value.kind(), KIND); // todo: check type recursive + EXPECT_NE(setOfVariants.count(value), 0); + } + } +/** Feeds a NotEqualConstrainedGenerator to both ConstrainedVectorGenerator entry points and checks size, type kind, encoding (constant, then flat) and that no row equals excludedValue. */ + template + void testGenerateVectorsPrimitive( + const TypePtr& type, + const variant& excludedValue) { + using T = typename TypeTraits::NativeType; + const uint32_t kSize = 1000; + std::shared_ptr generator = + std::make_shared( + 0, + type, + excludedValue, + std::make_unique>(0, type)); + auto vector = + ConstrainedVectorGenerator::generateConstant(generator, kSize, pool()); + EXPECT_EQ(vector->size(), kSize); + EXPECT_EQ(vector->typeKind(), 
KIND); + EXPECT_TRUE(vector->isConstantEncoding()); + EXPECT_NE(vector->as>()->valueAt(0), excludedValue); + + vector = ConstrainedVectorGenerator::generateFlat(generator, kSize, pool()); + EXPECT_EQ(vector->size(), kSize); + EXPECT_EQ(vector->typeKind(), KIND); + EXPECT_TRUE(vector->isFlatEncoding()); + for (auto i = 0; i < kSize; ++i) { + EXPECT_NE(vector->as>()->valueAt(i), excludedValue); + } + } +/** Runs generateFlat with a default generator for a complex type and checks the size and type of the resulting vector. */ + template + void testGenerateVectorsComplex(const TypePtr& type) { + using T = typename TypeTraits::ImplType; + const uint32_t kSize = 1000; + std::shared_ptr generator = + std::make_shared>(0, type); + auto vector = + ConstrainedVectorGenerator::generateFlat(generator, kSize, pool()); + EXPECT_EQ(vector->size(), kSize); + EXPECT_EQ(vector->type(), type); + } +}; + +TEST_F(ConstrainedGeneratorsTest, randomPrimitive) { + testRandomPrimitive(INTEGER()); + + testRandomPrimitive(VARCHAR()); +} + +TEST_F(ConstrainedGeneratorsTest, randomComplex) { + testRandomComplex(ARRAY(MAP(VARCHAR(), ROW({BIGINT()})))); +} + +TEST_F(ConstrainedGeneratorsTest, notEqPrimitive) { + testNotEqualPrimitive(TINYINT(), static_cast(1)); + + testNotEqualPrimitive(VARCHAR(), ""_sv); +} + +TEST_F(ConstrainedGeneratorsTest, notEqComplex) { + auto excludedVariant = variant::array({variant::map( + {{variant{"1"}, variant::row({variant{static_cast(1)}})}})}); + testNotEqualComplex( + ARRAY(MAP(VARCHAR(), ROW({BIGINT()}))), excludedVariant); +} + +TEST_F(ConstrainedGeneratorsTest, setPrimitive) { + std::unordered_set integers{{1, 2, 3}}; + testSetPrimitive(INTEGER(), integers); + + std::unordered_set strings{{"1", "2", "3"}}; + testSetPrimitive(VARCHAR(), strings); +} + +TEST_F(ConstrainedGeneratorsTest, setComplex) { + std::vector variants{ + variant::array({variant::map( + {{variant{"1"}, variant::row({variant{static_cast(1)}})}})}), + variant::array({variant::map( + {{variant{"2"}, + variant::row({variant{static_cast(2)}})}})})}; + testSetComplex( + ARRAY(MAP(VARCHAR(), ROW({BIGINT()}))), 
variants); +} +/** Generates JSON strings from random array values without random variation (the constructor default) and checks that each one parses back with the same serialization options into a JSON array. */ +TEST_F(ConstrainedGeneratorsTest, json) { + const TypePtr type = ARRAY(MAP(DOUBLE(), ROW({BIGINT()}))); + std::unique_ptr generator = + std::make_unique( + 0, + JSON(), + std::make_unique>(0, type)); + + const uint32_t kIterations = 1000; + const auto& opts = generator->serializationOptions(); + for (uint32_t i = 0; i < kIterations; ++i) { + auto value = generator->generate(); + EXPECT_TRUE(value.hasValue()); + EXPECT_EQ(value.kind(), TypeKind::VARCHAR); + folly::dynamic json; + EXPECT_NO_THROW( + json = folly::parseJson(value.value(), opts)); + EXPECT_TRUE(json.isArray()); + } +} + +TEST_F(ConstrainedGeneratorsTest, generateVectors) { + testGenerateVectorsPrimitive(BIGINT(), variant(0)); + testGenerateVectorsPrimitive(VARCHAR(), variant("")); + + testGenerateVectorsComplex( + ARRAY(ROW({MAP(VARCHAR(), BIGINT())}))); + testGenerateVectorsComplex( + MAP(ARRAY(BIGINT()), ROW({VARCHAR()}))); +} + +} // namespace facebook::velox::fuzzer::test diff --git a/velox/vector/fuzzer/Utils.cpp b/velox/vector/fuzzer/Utils.cpp index 805fcf1063d73..320157b064e08 100644 --- a/velox/vector/fuzzer/Utils.cpp +++ b/velox/vector/fuzzer/Utils.cpp @@ -16,13 +16,15 @@ #include "velox/vector/fuzzer/Utils.h" -namespace facebook::velox::generator_spec_utils { +namespace facebook::velox { bool coinToss(FuzzerGenerator& rng, double threshold) { static std::uniform_real_distribution<> dist(0.0, 1.0); return dist(rng) < threshold; } +namespace generator_spec_utils { + vector_size_t getRandomIndex(FuzzerGenerator& rng, vector_size_t maxIndex) { std::uniform_int_distribution indexGenerator( 0, maxIndex); // generates index in [0, maxIndex] @@ -59,4 +61,6 @@ BufferPtr generateIndicesBuffer( return indices; } -} // namespace facebook::velox::generator_spec_utils +} // namespace generator_spec_utils + +} // namespace facebook::velox diff --git a/velox/vector/fuzzer/Utils.h b/velox/vector/fuzzer/Utils.h index 5f51c3a52024f..89562c7de1720 100644 --- a/velox/vector/fuzzer/Utils.h +++ 
b/velox/vector/fuzzer/Utils.h @@ -23,10 +23,54 @@ namespace facebook::velox { using FuzzerGenerator = std::mt19937; -namespace generator_spec_utils { +enum UTF8CharList { + ASCII = 0, // Ascii character set. + UNICODE_CASE_SENSITIVE = 1, // Unicode scripts that support case. + EXTENDED_UNICODE = 2, // Extended Unicode: Arabic, Devanagiri etc + MATHEMATICAL_SYMBOLS = 3 // Mathematical Symbols. +}; + +/// Unicode character ranges. Ensure the vector indexes match the UTF8CharList +/// enum values. +/// +/// Source: https://jrgraphix.net/research/unicode_blocks.php +static const std::vector>> + kUTFChatSets{ + // UTF8CharList::ASCII + { + {33, 127}, // All ASCII printable chars. + }, + // UTF8CharList::UNICODE_CASE_SENSITIVE + { + {u'\u0020', u'\u007F'}, // Basic Latin. + {u'\u0400', u'\u04FF'}, // Cyrillic. + }, + // UTF8CharList::EXTENDED_UNICODE + { + {u'\u03F0', u'\u03FF'}, // Greek. + {u'\u0100', u'\u017F'}, // Latin Extended A. + {u'\u0600', u'\u06FF'}, // Arabic. + {u'\u0900', u'\u097F'}, // Devanagari. + {u'\u0600', u'\u06FF'}, // Hebrew. + {u'\u3040', u'\u309F'}, // Hiragana. + {u'\u2000', u'\u206F'}, // Punctuation. + {u'\u2070', u'\u209F'}, // Sub/Super Script. + {u'\u20A0', u'\u20CF'}, // Currency. + }, + // UTF8CharList::MATHEMATICAL_SYMBOLS + { + {u'\u2200', u'\u22FF'}, // Math Operators. + {u'\u2150', u'\u218F'}, // Number Forms. + {u'\u25A0', u'\u25FF'}, // Geometric Shapes. + {u'\u27C0', u'\u27EF'}, // Math Symbols. + {u'\u2A00', u'\u2AFF'}, // Supplemental. 
+ }, + }; bool coinToss(FuzzerGenerator& rng, double threshold); +namespace generator_spec_utils { + vector_size_t getRandomIndex(FuzzerGenerator& rng, vector_size_t maxIndex); BufferPtr generateNullsBuffer( diff --git a/velox/vector/fuzzer/VectorFuzzer.cpp b/velox/vector/fuzzer/VectorFuzzer.cpp index 56cd03a27c895..cd62a3e0ef98b 100644 --- a/velox/vector/fuzzer/VectorFuzzer.cpp +++ b/velox/vector/fuzzer/VectorFuzzer.cpp @@ -22,6 +22,7 @@ #include #include "velox/common/base/Exceptions.h" +#include "velox/experimental/fuzzer_input_generator/ConstrainedVectorGenerator.h" #include "velox/type/Timestamp.h" #include "velox/vector/BaseVector.h" #include "velox/vector/FlatVector.h" @@ -196,42 +197,6 @@ int128_t randLongDecimal(const TypePtr& type, FuzzerGenerator& rng) { return rand(rng) % DecimalUtil::kPowersOfTen[precision]; } -/// Unicode character ranges. Ensure the vector indexes match the UTF8CharList -/// enum values. -/// -/// Source: https://jrgraphix.net/research/unicode_blocks.php -const std::vector>> kUTFChatSets{ - // UTF8CharList::ASCII - { - {33, 127}, // All ASCII printable chars. - }, - // UTF8CharList::UNICODE_CASE_SENSITIVE - { - {u'\u0020', u'\u007F'}, // Basic Latin. - {u'\u0400', u'\u04FF'}, // Cyrillic. - }, - // UTF8CharList::EXTENDED_UNICODE - { - {u'\u03F0', u'\u03FF'}, // Greek. - {u'\u0100', u'\u017F'}, // Latin Extended A. - {u'\u0600', u'\u06FF'}, // Arabic. - {u'\u0900', u'\u097F'}, // Devanagari. - {u'\u0600', u'\u06FF'}, // Hebrew. - {u'\u3040', u'\u309F'}, // Hiragana. - {u'\u2000', u'\u206F'}, // Punctuation. - {u'\u2070', u'\u209F'}, // Sub/Super Script. - {u'\u20A0', u'\u20CF'}, // Currency. - }, - // UTF8CharList::MATHEMATICAL_SYMBOLS - { - {u'\u2200', u'\u22FF'}, // Math Operators. - {u'\u2150', u'\u218F'}, // Number Forms. - {u'\u25A0', u'\u25FF'}, // Geometric Shapes. - {u'\u27C0', u'\u27EF'}, // Math Symbols. - {u'\u2A00', u'\u2AFF'}, // Supplemental. 
- }, -}; - FOLLY_ALWAYS_INLINE char16_t getRandomChar( FuzzerGenerator& rng, const std::vector>& charSet) { @@ -278,7 +243,13 @@ VectorPtr fuzzConstantPrimitiveImpl( const TypePtr& type, vector_size_t size, FuzzerGenerator& rng, - const VectorFuzzer::Options& opts) { + const VectorFuzzer::Options& opts, + const std::shared_ptr& customGenerator) { + if (customGenerator) { + return fuzzer::ConstrainedVectorGenerator::generateConstant( + customGenerator, size, pool); + } + using TCpp = typename TypeTraits::NativeType; if constexpr (std::is_same_v) { std::wstring_convert, char16_t> converter; @@ -397,11 +368,16 @@ VectorPtr VectorFuzzer::fuzzNotNull(const TypePtr& type, vector_size_t size) { return fuzz(type, size); } -VectorPtr VectorFuzzer::fuzz(const TypePtr& type) { - return fuzz(type, opts_.vectorSize); +VectorPtr VectorFuzzer::fuzz( + const TypePtr& type, + const std::shared_ptr& customGenerator) { + return fuzz(type, opts_.vectorSize, customGenerator); } -VectorPtr VectorFuzzer::fuzz(const TypePtr& type, vector_size_t size) { +VectorPtr VectorFuzzer::fuzz( + const TypePtr& type, + vector_size_t size, + const std::shared_ptr& customGenerator) { VectorPtr vector; vector_size_t vectorSize = size; @@ -414,13 +390,17 @@ VectorPtr VectorFuzzer::fuzz(const TypePtr& type, vector_size_t size) { // 20% chance of adding a constant vector. if (coinToss(0.2)) { - vector = fuzzConstant(type, vectorSize); - } else if (type->isPrimitiveType()) { - vector = fuzzFlatPrimitive(type, vectorSize); + vector = fuzzConstant(type, vectorSize, customGenerator); } else if (type->isOpaque()) { vector = fuzzFlatOpaque(type, vectorSize); } else { - vector = fuzzComplex(type, vectorSize); + if (customGenerator) { + vector = fuzzer::ConstrainedVectorGenerator::generateFlat( + customGenerator, vectorSize, pool_); + } else { + vector = type->isPrimitiveType() ? 
fuzzFlatPrimitive(type, vectorSize) + : fuzzComplex(type, vectorSize); + } } if (vectorSize > size) { @@ -452,11 +432,16 @@ VectorPtr VectorFuzzer::fuzz(const GeneratorSpec& generatorSpec) { return generatorSpec.generateData(rng_, pool_, opts_.vectorSize); } -VectorPtr VectorFuzzer::fuzzConstant(const TypePtr& type) { - return fuzzConstant(type, opts_.vectorSize); +VectorPtr VectorFuzzer::fuzzConstant( + const TypePtr& type, + const std::shared_ptr& customGenerator) { + return fuzzConstant(type, opts_.vectorSize, customGenerator); } -VectorPtr VectorFuzzer::fuzzConstant(const TypePtr& type, vector_size_t size) { +VectorPtr VectorFuzzer::fuzzConstant( + const TypePtr& type, + vector_size_t size, + const std::shared_ptr& customGenerator) { // For constants, there are two possible cases: // - generate a regular constant vector (only for primitive types). // - generate a random vector and wrap it using a constant vector. @@ -475,7 +460,8 @@ VectorPtr VectorFuzzer::fuzzConstant(const TypePtr& type, vector_size_t size) { type, size, rng_, - opts_); + opts_, + customGenerator); } } @@ -496,9 +482,10 @@ VectorPtr VectorFuzzer::fuzzConstant(const TypePtr& type, vector_size_t size) { opts_.maxConstantContainerSize.value(), opts_.containerLength); opts_.complexElementsMaxSize = std::min( opts_.maxConstantContainerSize.value(), opts_.complexElementsMaxSize); + // todo: incorporate this into customGenerator. 
} return BaseVector::wrapInConstant( - size, constantIndex, fuzz(type, innerVectorSize)); + size, constantIndex, fuzz(type, innerVectorSize, customGenerator)); } VectorPtr VectorFuzzer::fuzzFlat(const TypePtr& type) { diff --git a/velox/vector/fuzzer/VectorFuzzer.h b/velox/vector/fuzzer/VectorFuzzer.h index 8c55c9d534422..b2a1cc8c7255e 100644 --- a/velox/vector/fuzzer/VectorFuzzer.h +++ b/velox/vector/fuzzer/VectorFuzzer.h @@ -19,20 +19,15 @@ #include #include +#include "velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h" #include "velox/type/Type.h" #include "velox/vector/BaseVector.h" #include "velox/vector/ComplexVector.h" #include "velox/vector/fuzzer/GeneratorSpec.h" +#include "velox/vector/fuzzer/Utils.h" namespace facebook::velox { -enum UTF8CharList { - ASCII = 0, // Ascii character set. - UNICODE_CASE_SENSITIVE = 1, // Unicode scripts that support case. - EXTENDED_UNICODE = 2, // Extended Unicode: Arabic, Devanagiri etc - MATHEMATICAL_SYMBOLS = 3 // Mathematical Symbols. -}; - struct DataSpec { bool includeNaN; bool includeInfinity; @@ -179,8 +174,15 @@ class VectorFuzzer { /// Returns a "fuzzed" vector, containing randomized data, nulls, and indices /// vector (dictionary). Returns a vector containing `opts_.vectorSize` or /// `size` elements. - VectorPtr fuzz(const TypePtr& type); - VectorPtr fuzz(const TypePtr& type, vector_size_t size); + VectorPtr fuzz( + const TypePtr& type, + const std::shared_ptr& customGenerator = + nullptr); + VectorPtr fuzz( + const TypePtr& type, + vector_size_t size, + const std::shared_ptr& customGenerator = + nullptr); /// Returns a "fuzzed" vector containing randomized data customized according /// to generatorSpec. @@ -204,8 +206,15 @@ class VectorFuzzer { /// Returns a random constant vector (which could be a null constant). Returns /// a vector with size set to `opts_.vectorSize` or 'size'. 
- VectorPtr fuzzConstant(const TypePtr& type); - VectorPtr fuzzConstant(const TypePtr& type, vector_size_t size); + VectorPtr fuzzConstant( + const TypePtr& type, + const std::shared_ptr& customGenerator = + nullptr); + VectorPtr fuzzConstant( + const TypePtr& type, + vector_size_t size, + const std::shared_ptr& customGenerator = + nullptr); /// Wraps `vector` using a randomized indices vector, returning a /// DictionaryVector which has same number of indices as the underlying diff --git a/velox/vector/fuzzer/tests/VectorFuzzerTest.cpp b/velox/vector/fuzzer/tests/VectorFuzzerTest.cpp index 8cad48c4a74e6..d66c68590b108 100644 --- a/velox/vector/fuzzer/tests/VectorFuzzerTest.cpp +++ b/velox/vector/fuzzer/tests/VectorFuzzerTest.cpp @@ -19,6 +19,8 @@ #include #include "velox/common/memory/Memory.h" +#include "velox/experimental/fuzzer_input_generator/ConstrainedGenerators.h" +#include "velox/functions/prestosql/types/JsonType.h" #include "velox/vector/DictionaryVector.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -26,6 +28,9 @@ using namespace facebook::velox; namespace { +using facebook::velox::fuzzer::JsonInputGenerator; +using facebook::velox::fuzzer::RandomInputGenerator; + class VectorFuzzerTest : public testing::Test { public: static void SetUpTestCase() { @@ -940,4 +945,32 @@ TEST_F(VectorFuzzerTest, randOrderableType) { ASSERT_TRUE(fuzzer.randOrderableType()->isOrderable()); } } + +TEST_F(VectorFuzzerTest, jsonConstrained) { + VectorFuzzer::Options opts; + VectorFuzzer fuzzer(opts, pool()); + + const TypePtr type = ARRAY(ROW({BIGINT()})); + std::shared_ptr generator = + std::make_shared( + 0, + JSON(), + std::make_unique>(0, type)); + + const uint32_t kSize = 1000; + const auto& jsonOpts = generator->serializationOptions(); + DecodedVector decoded; + for (auto i = 0; i < 10; ++i) { + auto vector = fuzzer.fuzz(JSON(), kSize, generator); + VELOX_CHECK_NE(vector, nullptr); + VELOX_CHECK_EQ(vector->type()->kind(), TypeKind::VARCHAR); + decoded.decode(*vector, 
SelectivityVector(kSize)); + for (auto j = 0; j < kSize; ++j) { + std::string value = decoded.valueAt(j); + folly::dynamic json; + EXPECT_NO_THROW(json = folly::parseJson(value, jsonOpts)); + EXPECT_TRUE(json.isArray()); + } + } +} } // namespace