diff --git a/velox/docs/develop/testing/fuzzer.rst b/velox/docs/develop/testing/fuzzer.rst index 072a3bd0a748..b6c9c9c05280 100644 --- a/velox/docs/develop/testing/fuzzer.rst +++ b/velox/docs/develop/testing/fuzzer.rst @@ -369,7 +369,7 @@ ExpressionRunner supports the following flags: * ``--complex_constant_path`` optional path to complex constants that aren't accurately expressable in SQL (Array, Map, Structs, ...). This is used with SQL file to reproduce the exact expression, not needed when the expression doesn't contain complex constants. -* ``--lazy_column_list_path`` optional path for the file stored on-disk which contains a vector of column indices that specify which columns of the input row vector should be wrapped in lazy. This is used when the failing test included input columns that were lazy vector. +* ``--input_row_metadata_path`` optional path for the file stored on-disk which contains a struct containing input row metadata. This includes columns in the input row vector to be wrapped in a lazy vector and/or dictionary encoded. It may also contain a dictionary peel for columns requiring dictionary encoding. This is used when the failing test included input columns that were lazy vectors and/or had columns wrapped with a common dictionary wrap. * ``--result_path`` optional path to result vector that was created by the Fuzzer. Result vector is used to reproduce cases where Fuzzer passes dirty vectors to expression evaluation as a result buffer. This ensures that functions are implemented correctly, taking into consideration dirty result buffer. diff --git a/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp index f37133ca6eda..ddae30f11f0e 100644 --- a/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp @@ -115,20 +115,39 @@ ExpressionFuzzerVerifier::ExpressionFuzzerVerifier( } } -std::vector ExpressionFuzzerVerifier::generateLazyColumnIds( +InputRowMetadata ExpressionFuzzerVerifier::generateInputRowMetadata( const RowVectorPtr& rowVector, VectorFuzzer& vectorFuzzer) { - std::vector columnsToWrapInLazy; - if (options_.lazyVectorGenerationRatio > 0) { - for (int idx = 0; idx < rowVector->childrenSize(); idx++) { - VELOX_CHECK_NOT_NULL(rowVector->childAt(idx)); - if (vectorFuzzer.coinToss(options_.lazyVectorGenerationRatio)) { - columnsToWrapInLazy.push_back( - vectorFuzzer.coinToss(0.8) ? idx : -1 * idx); - } + InputRowMetadata inputRowMetadata; + if (options_.commonDictionaryWrapRatio <= 0 && + options_.lazyVectorGenerationRatio <= 0) { + return inputRowMetadata; + } + + bool wrapInCommonDictionary = + vectorFuzzer.coinToss(options_.commonDictionaryWrapRatio); + for (int idx = 0; idx < rowVector->childrenSize(); idx++) { + const auto& child = rowVector->childAt(idx); + VELOX_CHECK_NOT_NULL(child); + if (child->encoding() != VectorEncoding::Simple::DICTIONARY && + wrapInCommonDictionary) { + inputRowMetadata.columnsToWrapInCommonDictionary.push_back(idx); } + if (vectorFuzzer.coinToss(options_.lazyVectorGenerationRatio)) { + inputRowMetadata.columnsToWrapInLazy.push_back( + vectorFuzzer.coinToss(0.8) ? idx : -1 * idx); + } + } + // Skip wrapping in common dictionary if there is only one column. + if (inputRowMetadata.columnsToWrapInCommonDictionary.size() > 1) { + auto inputSize = rowVector->size(); + inputRowMetadata.commonDictionaryIndices = + vectorFuzzer.fuzzIndices(inputSize, inputSize); + inputRowMetadata.commonDictionaryNulls = vectorFuzzer.fuzzNulls(inputSize); + } else { + inputRowMetadata.columnsToWrapInCommonDictionary.clear(); } - return columnsToWrapInLazy; + return inputRowMetadata; } void ExpressionFuzzerVerifier::reSeed() { @@ -233,7 +252,7 @@ void ExpressionFuzzerVerifier::retryWithTry( std::vector plans, const RowVectorPtr& rowVector, const VectorPtr& resultVector, - const std::vector& columnsToWrapInLazy) { + const InputRowMetadata& inputRowMetadata) { // Wrap each expression tree with 'try'. std::vector tryPlans; for (auto& plan : plans) { @@ -252,7 +271,7 @@ void ExpressionFuzzerVerifier::retryWithTry( std::nullopt, resultVector ? BaseVector::copy(*resultVector) : nullptr, false, // canThrow - columnsToWrapInLazy); + inputRowMetadata); } catch (const std::exception&) { if (options_.findMinimalSubexpression) { test::computeMinimumSubExpression( @@ -261,7 +280,7 @@ void ExpressionFuzzerVerifier::retryWithTry( plans, rowVector, std::nullopt, - columnsToWrapInLazy); + inputRowMetadata); } throw; } @@ -286,7 +305,7 @@ void ExpressionFuzzerVerifier::retryWithTry( noErrorRows, resultVector ? BaseVector::copy(*resultVector) : nullptr, false, // canThrow - columnsToWrapInLazy); + inputRowMetadata); } catch (const std::exception&) { if (options_.findMinimalSubexpression) { test::computeMinimumSubExpression( @@ -295,7 +314,7 @@ void ExpressionFuzzerVerifier::retryWithTry( plans, rowVector, noErrorRows, - columnsToWrapInLazy); + inputRowMetadata); } throw; } @@ -358,7 +377,8 @@ void ExpressionFuzzerVerifier::go() { auto rowVector = fuzzInputWithRowNumber(*vectorFuzzer_, inputType); - auto columnsToWrapInLazy = generateLazyColumnIds(rowVector, *vectorFuzzer_); + InputRowMetadata inputRowMetadata = + generateInputRowMetadata(rowVector, *vectorFuzzer_); auto resultVectors = generateResultVectors(plans); ResultOrError result; @@ -370,7 +390,7 @@ void ExpressionFuzzerVerifier::go() { std::nullopt, resultVectors ? BaseVector::copy(*resultVectors) : nullptr, true, // canThrow - columnsToWrapInLazy); + inputRowMetadata); } catch (const std::exception&) { if (options_.findMinimalSubexpression) { test::computeMinimumSubExpression( @@ -379,7 +399,7 @@ void ExpressionFuzzerVerifier::go() { plans, rowVector, std::nullopt, - columnsToWrapInLazy); + inputRowMetadata); } throw; } @@ -396,7 +416,7 @@ void ExpressionFuzzerVerifier::go() { !result.unsupportedInputUncatchableError) { LOG(INFO) << "Both paths failed with compatible exceptions. Retrying expression using try()."; - retryWithTry(plans, rowVector, resultVectors, columnsToWrapInLazy); + retryWithTry(plans, rowVector, resultVectors, inputRowMetadata); } LOG(INFO) << "==============================> Done with iteration " << i; diff --git a/velox/expression/fuzzer/ExpressionFuzzerVerifier.h b/velox/expression/fuzzer/ExpressionFuzzerVerifier.h index b82a7b454c42..c03de0de2b3d 100644 --- a/velox/expression/fuzzer/ExpressionFuzzerVerifier.h +++ b/velox/expression/fuzzer/ExpressionFuzzerVerifier.h @@ -24,6 +24,7 @@ #include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/expression/tests/ExpressionVerifier.h" #include "velox/functions/FunctionRegistry.h" +#include "velox/vector/VectorSaver.h" #include "velox/vector/fuzzer/VectorFuzzer.h" #include "velox/vector/tests/utils/VectorMaker.h" @@ -95,6 +96,14 @@ class ExpressionFuzzerVerifier { // 1). double lazyVectorGenerationRatio = 0.0; + // Specifies the probability with which columns in the input row vector will + // be selected to be wrapped in a common dictionary layer (expressed as + // double from 0 to 1). Only columns that are not already dictionary encoded + // will be selected as eventually only one dictionary wrap will be allowed + // so additional wrap can be folded into the existing one. This is to + // replicate inputs coming from a filter, union, or join. + double commonDictionaryWrapRatio = 0.0; + // This sets an upper limit on the number of expression trees to generate // per step. These trees would be executed in the same ExprSet and can // re-use already generated columns and subexpressions (if re-use is @@ -164,7 +173,7 @@ class ExpressionFuzzerVerifier { std::vector plans, const RowVectorPtr& rowVector, const VectorPtr& resultVectors, - const std::vector& columnsToWrapInLazy); + const InputRowMetadata& columnsToWrapInLazy); /// If --duration_sec > 0, check if we expired the time budget. Otherwise, /// check if we expired the number of iterations (--steps). @@ -180,11 +189,15 @@ class ExpressionFuzzerVerifier { /// proportionOfTimesSelected numProcessedRows. void logStats(); - // Randomly pick columns from the input row vector to wrap in lazy. - // Negative column indices represent lazy vectors that have been preloaded - // before feeding them to the evaluator. This list is sorted on the absolute - // value of the entries. - std::vector generateLazyColumnIds( + // Generates InputRowMetadata which contains the following: + // 1. Randomly picked columns from the input row vector to wrap + // in lazy. Negative column indices represent lazy vectors that have been + // preloaded before feeding them to the evaluator. + // 2. Randomly picked columns (2 or more) from the input row vector to + // wrap in a common dictionary layer. Only columns not already dictionary + // encoded are picked. + // Note: These lists are sorted on the absolute value of the entries. + InputRowMetadata generateInputRowMetadata( const RowVectorPtr& rowVector, VectorFuzzer& vectorFuzzer); diff --git a/velox/expression/fuzzer/FuzzerRunner.cpp b/velox/expression/fuzzer/FuzzerRunner.cpp index 22ca6c957a74..7c06a5ac6af6 100644 --- a/velox/expression/fuzzer/FuzzerRunner.cpp +++ b/velox/expression/fuzzer/FuzzerRunner.cpp @@ -65,6 +65,14 @@ DEFINE_double( "vector will be selected to be wrapped in lazy encoding " "(expressed as double from 0 to 1)."); +DEFINE_double( + common_dictionary_wraps_generation_ratio, + 0.0, + "Specifies the probability with which columns in the input row " + "vector will be selected to be wrapped in a common dictionary wrap " + "(expressed as double from 0 to 1). Only columns not already encoded " + "will be considered."); + DEFINE_int32( max_expression_trees_per_step, 1, @@ -207,6 +215,8 @@ ExpressionFuzzerVerifier::Options getExpressionFuzzerVerifierOptions( opts.reproPersistPath = FLAGS_repro_persist_path; opts.persistAndRunOnce = FLAGS_persist_and_run_once; opts.lazyVectorGenerationRatio = FLAGS_lazy_vector_generation_ratio; + opts.commonDictionaryWrapRatio = + FLAGS_common_dictionary_wraps_generation_ratio; opts.maxExpressionTreesPerStep = FLAGS_max_expression_trees_per_step; opts.vectorFuzzerOptions = getVectorFuzzerOptions(); opts.expressionFuzzerOptions = getExpressionFuzzerOptions( diff --git a/velox/expression/fuzzer/FuzzerToolkit.cpp b/velox/expression/fuzzer/FuzzerToolkit.cpp index 292f4619bb66..41b261de7cfa 100644 --- a/velox/expression/fuzzer/FuzzerToolkit.cpp +++ b/velox/expression/fuzzer/FuzzerToolkit.cpp @@ -14,9 +14,30 @@ * limitations under the License. */ #include "velox/expression/fuzzer/FuzzerToolkit.h" +#include "velox/vector/VectorSaver.h" namespace facebook::velox::fuzzer { +namespace { +template +void saveStdVector(const std::vector& list, std::ostream& out) { + // Size of the vector + size_t size = list.size(); + out.write((char*)&(size), sizeof(size)); + out.write( + reinterpret_cast(list.data()), list.size() * sizeof(T)); +} + +template +std::vector restoreStdVector(std::istream& in) { + size_t size; + in.read((char*)&size, sizeof(size)); + std::vector vec(size); + in.read(reinterpret_cast(vec.data()), size * sizeof(T)); + return vec; +} +} // namespace + std::string CallableSignature::toString() const { std::string buf = name; buf.append("( "); @@ -137,4 +158,60 @@ void compareVectors( LOG(INFO) << "Two vectors match."; } +RowVectorPtr applyCommonDictionaryLayer( + const RowVectorPtr& rowVector, + const InputRowMetadata& inputRowMetadata) { + if (inputRowMetadata.columnsToWrapInCommonDictionary.empty()) { + return rowVector; + } + auto size = rowVector->size(); + auto& nulls = inputRowMetadata.commonDictionaryNulls; + auto& indices = inputRowMetadata.commonDictionaryIndices; + if (nulls) { + VELOX_CHECK_LE(bits::nbytes(size), nulls->size()); + } + VELOX_CHECK_LE(size, indices->size() / sizeof(vector_size_t)); + std::vector newInputs; + int listIndex = 0; + auto& columnsToWrap = inputRowMetadata.columnsToWrapInCommonDictionary; + for (int idx = 0; idx < rowVector->childrenSize(); idx++) { + auto& child = rowVector->childAt(idx); + VELOX_CHECK_NOT_NULL(child); + if (listIndex < columnsToWrap.size() && idx == columnsToWrap[listIndex]) { + newInputs.push_back( + BaseVector::wrapInDictionary(nulls, indices, size, child)); + listIndex++; + } else { + newInputs.push_back(child); + } + } + return std::make_shared( + rowVector->pool(), rowVector->type(), nullptr, size, newInputs); +} + +void InputRowMetadata::saveToFile(const char* filePath) const { + std::ofstream outputFile(filePath, std::ofstream::binary); + saveStdVector(columnsToWrapInLazy, outputFile); + saveStdVector(columnsToWrapInCommonDictionary, outputFile); + writeOptionalBuffer(commonDictionaryIndices, outputFile); + writeOptionalBuffer(commonDictionaryNulls, outputFile); + outputFile.close(); +} + +InputRowMetadata InputRowMetadata::restoreFromFile( + const char* filePath, + memory::MemoryPool* pool) { + InputRowMetadata ret; + std::ifstream in(filePath, std::ifstream::binary); + ret.columnsToWrapInLazy = restoreStdVector(in); + if (in.peek() != EOF) { + // this allows reading old files that only saved columnsToWrapInLazy. + ret.columnsToWrapInCommonDictionary = restoreStdVector(in); + ret.commonDictionaryIndices = readOptionalBuffer(in, pool); + ret.commonDictionaryNulls = readOptionalBuffer(in, pool); + } + in.close(); + return ret; +} + } // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/FuzzerToolkit.h b/velox/expression/fuzzer/FuzzerToolkit.h index 6eb0a17f9909..d9a7e7e8a2ca 100644 --- a/velox/expression/fuzzer/FuzzerToolkit.h +++ b/velox/expression/fuzzer/FuzzerToolkit.h @@ -114,4 +114,35 @@ void compareVectors( const std::string& leftName = "left", const std::string& rightName = "right", const std::optional& rows = std::nullopt); + +struct InputRowMetadata { + // Column indices to wrap in LazyVector (in a strictly increasing order) + std::vector columnsToWrapInLazy; + + // Column indices to wrap in a common dictionary layer (in a strictly + // increasing order) + std::vector columnsToWrapInCommonDictionary; + + // Dictionary indices and nulls for the common dictionary layer. Buffers are + // null if no columns are specified in `columnsToWrapInCommonDictionary`. + BufferPtr commonDictionaryIndices; + BufferPtr commonDictionaryNulls; + + bool empty() const { + return columnsToWrapInLazy.empty() && + columnsToWrapInCommonDictionary.empty(); + } + + void saveToFile(const char* filePath) const; + static InputRowMetadata restoreFromFile( + const char* filePath, + memory::MemoryPool* pool); +}; + +// Wraps the columns in the row vector with a common dictionary layer. The +// column indices to wrap and the wrap itself is specified in +// `inputRowMetadata`. +RowVectorPtr applyCommonDictionaryLayer( + const RowVectorPtr& rowVector, + const InputRowMetadata& inputRowMetadata); } // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/tests/FuzzerToolkitTest.cpp b/velox/expression/fuzzer/tests/FuzzerToolkitTest.cpp new file mode 100644 index 000000000000..8dba6a0b1d87 --- /dev/null +++ b/velox/expression/fuzzer/tests/FuzzerToolkitTest.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/expression/fuzzer/FuzzerToolkit.h" + +#include +#include "velox/exec/tests/utils/TempFilePath.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +namespace facebook::velox::fuzzer::test { +class FuzzerToolKitTest : public testing::Test, + public facebook::velox::test::VectorTestBase { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + bool compareBuffers(const BufferPtr& lhs, const BufferPtr& rhs) { + if (!lhs && !rhs) { + return true; + } + if ((lhs && !rhs) || (!lhs && rhs) || lhs->size() != rhs->size()) { + return false; + } + return memcmp(lhs->as(), rhs->as(), lhs->size()) == 0; + } + + bool equals(const InputRowMetadata& lhs, const InputRowMetadata& rhs) { + return lhs.columnsToWrapInLazy == rhs.columnsToWrapInLazy && + lhs.columnsToWrapInCommonDictionary == + rhs.columnsToWrapInCommonDictionary && + compareBuffers( + lhs.commonDictionaryIndices, rhs.commonDictionaryIndices) && + compareBuffers(lhs.commonDictionaryNulls, rhs.commonDictionaryNulls); + } +}; + +TEST_F(FuzzerToolKitTest, inputRowMetadataRoundTrip) { + InputRowMetadata metadata; + metadata.columnsToWrapInLazy = {1, -2, 3, -4, 5}; + metadata.columnsToWrapInCommonDictionary = {1, 2, 3, 4, 5}; + metadata.commonDictionaryIndices = makeIndicesInReverse(5); + metadata.commonDictionaryNulls = makeNulls({true, false, true, false, true}); + + { + auto path = exec::test::TempFilePath::create(); + metadata.saveToFile(path->getPath().c_str()); + auto copy = + InputRowMetadata::restoreFromFile(path->getPath().c_str(), pool()); + ASSERT_TRUE(equals(metadata, copy)); + } + + metadata.commonDictionaryNulls = nullptr; + { + auto path = exec::test::TempFilePath::create(); + metadata.saveToFile(path->getPath().c_str()); + auto copy = + InputRowMetadata::restoreFromFile(path->getPath().c_str(), pool()); + ASSERT_TRUE(equals(metadata, copy)); + } + + metadata.columnsToWrapInCommonDictionary.clear(); + metadata.commonDictionaryIndices = nullptr; + { + auto path = exec::test::TempFilePath::create(); + metadata.saveToFile(path->getPath().c_str()); + auto copy = + InputRowMetadata::restoreFromFile(path->getPath().c_str(), pool()); + ASSERT_TRUE(equals(metadata, copy)); + } + + metadata.columnsToWrapInLazy.clear(); + metadata.commonDictionaryIndices = nullptr; + { + auto path = exec::test::TempFilePath::create(); + metadata.saveToFile(path->getPath().c_str()); + auto copy = + InputRowMetadata::restoreFromFile(path->getPath().c_str(), pool()); + ASSERT_TRUE(equals(metadata, copy)); + } +} +} // namespace facebook::velox::fuzzer::test diff --git a/velox/expression/tests/ExpressionRunner.cpp b/velox/expression/tests/ExpressionRunner.cpp index 85356e7cd1d6..7476415d506f 100644 --- a/velox/expression/tests/ExpressionRunner.cpp +++ b/velox/expression/tests/ExpressionRunner.cpp @@ -21,6 +21,7 @@ #include "velox/core/QueryCtx.h" #include "velox/exec/tests/utils/AssertQueryBuilder.h" #include "velox/expression/Expr.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/expression/tests/ExpressionRunner.h" #include "velox/expression/tests/ExpressionVerifier.h" #include "velox/parse/Expressions.h" @@ -114,7 +115,7 @@ void ExpressionRunner::run( const std::string& mode, vector_size_t numRows, const std::string& storeResultPath, - const std::string& lazyColumnListPath, + const std::string& inputRowMetadataPath, std::shared_ptr referenceQueryRunner, bool findMinimalSubExpression, bool useSeperatePoolForInput) { @@ -143,10 +144,10 @@ void ExpressionRunner::run( VELOX_CHECK_GT(inputVector->size(), 0, "Input vector must not be empty."); } - std::vector columnsToWrapInLazy; - if (!lazyColumnListPath.empty()) { - columnsToWrapInLazy = - restoreStdVectorFromFile(lazyColumnListPath.c_str()); + fuzzer::InputRowMetadata inputRowMetadata; + if (!inputRowMetadataPath.empty()) { + inputRowMetadata = fuzzer::InputRowMetadata::restoreFromFile( + inputRowMetadataPath.c_str(), pool.get()); } parse::registerTypeResolver(); @@ -200,7 +201,7 @@ void ExpressionRunner::run( std::nullopt, std::move(resultVector), true, - columnsToWrapInLazy); + inputRowMetadata); } catch (const std::exception&) { if (findMinimalSubExpression) { VectorFuzzer::Options options; @@ -211,16 +212,15 @@ void ExpressionRunner::run( typedExprs, inputVector, std::nullopt, - columnsToWrapInLazy); + inputRowMetadata); } throw; } } else if (mode == "common") { - if (!columnsToWrapInLazy.empty()) { - inputVector = - VectorFuzzer::fuzzRowChildrenToLazy(inputVector, columnsToWrapInLazy); - } + inputVector = VectorFuzzer::fuzzRowChildrenToLazy( + inputVector, inputRowMetadata.columnsToWrapInLazy); + inputVector = applyCommonDictionaryLayer(inputVector, inputRowMetadata); exec::ExprSet exprSet(typedExprs, &execCtx); auto results = evaluateAndPrintResults(exprSet, inputVector, rows, execCtx); if (!storeResultPath.empty()) { diff --git a/velox/expression/tests/ExpressionRunner.h b/velox/expression/tests/ExpressionRunner.h index 1be4c485567e..76b8c9a3f108 100644 --- a/velox/expression/tests/ExpressionRunner.h +++ b/velox/expression/tests/ExpressionRunner.h @@ -47,9 +47,9 @@ class ExpressionRunner { /// @param storeResultPath The path to a directory on disk where the results /// of expression or query evaluation will be stored. If empty, the results /// will not be stored. - /// @param lazyColumnListPath The path to on-disk vector of column indices - /// that specify which columns of the input row vector should be wrapped in - /// lazy. + /// @param inputRowMetadataPath The path to on-disk serialized struct that + /// contains metadata about the input row vector like the columns + /// to wrap in lazy or dictionary encoding and the dictionary wrap. /// @param findMinimalSubExpression Whether to find minimum failing /// subexpression on result mismatch. /// @param useSeperatePoolForInput Whether to use separate memory pools for @@ -70,7 +70,7 @@ class ExpressionRunner { const std::string& mode, vector_size_t numRows, const std::string& storeResultPath, - const std::string& lazyColumnListPath, + const std::string& inputRowMetadataPath, std::shared_ptr referenceQueryRunner, bool findMinimalSubExpression = false, bool useSeperatePoolForInput = true); diff --git a/velox/expression/tests/ExpressionRunnerTest.cpp b/velox/expression/tests/ExpressionRunnerTest.cpp index 34b2f3be21be..b942574d92e0 100644 --- a/velox/expression/tests/ExpressionRunnerTest.cpp +++ b/velox/expression/tests/ExpressionRunnerTest.cpp @@ -87,11 +87,12 @@ DEFINE_string( "table 't'."); DEFINE_string( - lazy_column_list_path, + input_row_metadata_path, "", - "Path for the file stored on-disk which contains a vector of column " - "indices that specify which columns of the input row vector should " - "be wrapped in lazy."); + "Path for the file stored on-disk which contains a struct containing " + "input row metadata. This includes columns in the input row vector to " + "be wrapped in a lazy vector and/or dictionary encoded. It may also " + "contain a dictionary peel for columns requiring dictionary encoding."); DEFINE_bool( use_seperate_memory_pool_for_input_vector, @@ -204,11 +205,11 @@ static void checkDirForExpectedFiles() { ? checkAndReturnFilePath( test::ExpressionVerifier::kExpressionSqlFileName, "sql_path") : FLAGS_sql_path; - FLAGS_lazy_column_list_path = FLAGS_lazy_column_list_path.empty() + FLAGS_input_row_metadata_path = FLAGS_input_row_metadata_path.empty() ? checkAndReturnFilePath( - test::ExpressionVerifier::kIndicesOfLazyColumnsFileName, - "lazy_column_list_path") - : FLAGS_lazy_column_list_path; + test::ExpressionVerifier::kInputRowMetadataFileName, + "input_row_metadata_path") + : FLAGS_input_row_metadata_path; FLAGS_complex_constant_path = FLAGS_complex_constant_path.empty() ? checkAndReturnFilePath( test::ExpressionVerifier::kComplexConstantsFileName, @@ -266,7 +267,7 @@ int main(int argc, char** argv) { FLAGS_mode, FLAGS_num_rows, FLAGS_store_result_path, - FLAGS_lazy_column_list_path, + FLAGS_input_row_metadata_path, referenceQueryRunner, FLAGS_find_minimal_subexpression, FLAGS_use_seperate_memory_pool_for_input_vector); diff --git a/velox/expression/tests/ExpressionVerifier.cpp b/velox/expression/tests/ExpressionVerifier.cpp index 5efe9dd83251..f1bc21773e63 100644 --- a/velox/expression/tests/ExpressionVerifier.cpp +++ b/velox/expression/tests/ExpressionVerifier.cpp @@ -99,7 +99,7 @@ fuzzer::ResultOrError ExpressionVerifier::verify( const std::optional& rowsToVerify, VectorPtr&& resultVector, bool canThrow, - std::vector columnsToWrapInLazy) { + const InputRowMetadata& inputRowMetadata) { for (int i = 0; i < plans.size(); ++i) { LOG(INFO) << "Executing expression " << i << " : " << plans[i]->toString(); } @@ -131,7 +131,7 @@ fuzzer::ResultOrError ExpressionVerifier::verify( } if (options_.persistAndRunOnce) { persistReproInfo( - rowVector, columnsToWrapInLazy, copiedResult, sql, complexConstants); + rowVector, inputRowMetadata, copiedResult, sql, complexConstants); } } @@ -180,12 +180,15 @@ fuzzer::ResultOrError ExpressionVerifier::verify( plans, execCtx_, !options_.disableConstantFolding); auto inputRowVector = rowVector; VectorPtr copiedInput; - if (!columnsToWrapInLazy.empty()) { - inputRowVector = - VectorFuzzer::fuzzRowChildrenToLazy(rowVector, columnsToWrapInLazy); + inputRowVector = VectorFuzzer::fuzzRowChildrenToLazy( + rowVector, inputRowMetadata.columnsToWrapInLazy); + inputRowVector = + applyCommonDictionaryLayer(inputRowVector, inputRowMetadata); + if (inputRowVector != rowVector) { VLOG(1) << "Modified inputs for common eval path: "; logRowVector(inputRowVector); - } else { + } + if (inputRowMetadata.columnsToWrapInLazy.empty()) { // Copy loads lazy vectors so only do this when there are no lazy inputs. copiedInput = BaseVector::copy(*inputRowVector); } @@ -215,7 +218,7 @@ fuzzer::ResultOrError ExpressionVerifier::verify( << "Common eval: VeloxRuntimeErrors other than UNSUPPORTED_INPUT_UNCATCHABLE error are not allowed."; } persistReproInfoIfNeeded( - rowVector, columnsToWrapInLazy, copiedResult, sql, complexConstants); + rowVector, inputRowMetadata, copiedResult, sql, complexConstants); throw; } exceptionCommonPtr = std::current_exception(); @@ -223,7 +226,7 @@ fuzzer::ResultOrError ExpressionVerifier::verify( LOG(ERROR) << "Common eval: Exceptions other than VeloxUserError or VeloxRuntimeError of UNSUPPORTED_INPUT_UNCATCHABLE are not allowed."; persistReproInfoIfNeeded( - rowVector, columnsToWrapInLazy, copiedResult, sql, complexConstants); + rowVector, inputRowMetadata, copiedResult, sql, complexConstants); throw; } @@ -232,6 +235,8 @@ fuzzer::ResultOrError ExpressionVerifier::verify( if (referenceQueryRunner_ != nullptr) { VLOG(1) << "Execute with reference DB."; auto inputRowVector = rowVector; + inputRowVector = + applyCommonDictionaryLayer(inputRowVector, inputRowMetadata); inputRowVector = reduceToSelectedRows(rowVector, rows); auto projectionPlan = makeProjectionPlan(inputRowVector, plans); auto referenceResultOrError = computeReferenceResults( @@ -289,11 +294,7 @@ fuzzer::ResultOrError ExpressionVerifier::verify( } } catch (...) { persistReproInfoIfNeeded( - rowVector, - columnsToWrapInLazy, - copiedResult, - sql, - complexConstants); + rowVector, inputRowMetadata, copiedResult, sql, complexConstants); throw; } } @@ -301,17 +302,24 @@ fuzzer::ResultOrError ExpressionVerifier::verify( VLOG(1) << "Execute with simplified expression eval path."; try { exec::ExprSetSimplified exprSetSimplified(plans, execCtx_); + auto inputRowVector = + applyCommonDictionaryLayer(rowVector, inputRowMetadata); + if (inputRowVector != rowVector) { + VLOG(1) << "Modified inputs for simplified eval path: "; + logRowVector(inputRowVector); + } + exec::EvalCtx evalCtxSimplified( - execCtx_, &exprSetSimplified, rowVector.get()); + execCtx_, &exprSetSimplified, inputRowVector.get()); - auto copy = BaseVector::copy(*rowVector); + auto copy = BaseVector::copy(*inputRowVector); exprSetSimplified.eval(rows, evalCtxSimplified, simplifiedEvalResult); // Flatten the input vector as an optimization if its very deeply // nested. fuzzer::compareVectors( copy, - BaseVector::copy(*rowVector), + BaseVector::copy(*inputRowVector), "Copy of original input", "Input after simplified", rows); @@ -322,11 +330,7 @@ fuzzer::ResultOrError ExpressionVerifier::verify( LOG(ERROR) << "Simplified eval: VeloxRuntimeErrors other than UNSUPPORTED_INPUT_UNCATCHABLE error are not allowed."; persistReproInfoIfNeeded( - rowVector, - columnsToWrapInLazy, - copiedResult, - sql, - complexConstants); + rowVector, inputRowMetadata, copiedResult, sql, complexConstants); throw; } exceptionSimplifiedPtr = std::current_exception(); @@ -334,7 +338,7 @@ fuzzer::ResultOrError ExpressionVerifier::verify( LOG(ERROR) << "Simplified eval: Exceptions other than VeloxUserError or VeloxRuntimeError with UNSUPPORTED_INPUT are not allowed."; persistReproInfoIfNeeded( - rowVector, columnsToWrapInLazy, copiedResult, sql, complexConstants); + rowVector, inputRowMetadata, copiedResult, sql, complexConstants); throw; } @@ -371,7 +375,7 @@ fuzzer::ResultOrError ExpressionVerifier::verify( } } catch (...) { persistReproInfoIfNeeded( - rowVector, columnsToWrapInLazy, copiedResult, sql, complexConstants); + rowVector, inputRowMetadata, copiedResult, sql, complexConstants); throw; } } @@ -399,7 +403,7 @@ fuzzer::ResultOrError ExpressionVerifier::verify( void ExpressionVerifier::persistReproInfoIfNeeded( const VectorPtr& inputVector, - const std::vector& columnsToWrapInLazy, + const InputRowMetadata& inputRowMetadata, const VectorPtr& resultVector, const std::string& sql, const std::vector& complexConstants) { @@ -407,18 +411,18 @@ void ExpressionVerifier::persistReproInfoIfNeeded( LOG(INFO) << "Skipping persistence because repro path is empty."; } else if (!options_.persistAndRunOnce) { persistReproInfo( - inputVector, columnsToWrapInLazy, resultVector, sql, complexConstants); + inputVector, inputRowMetadata, resultVector, sql, complexConstants); } } void ExpressionVerifier::persistReproInfo( const VectorPtr& inputVector, - std::vector columnsToWrapInLazy, + const InputRowMetadata& inputRowMetadata, const VectorPtr& resultVector, const std::string& sql, const std::vector& complexConstants) { std::string inputPath; - std::string lazyListPath; + std::string inputRowMetadataPath; std::string resultPath; std::string sqlPath; std::string complexConstantsPath; @@ -443,13 +447,13 @@ void ExpressionVerifier::persistReproInfo( } // Saving the list of column indices that are to be wrapped in lazy. - if (!columnsToWrapInLazy.empty()) { - lazyListPath = - fmt::format("{}/{}", dirPath->c_str(), kIndicesOfLazyColumnsFileName); + if (!inputRowMetadata.empty()) { + inputRowMetadataPath = + fmt::format("{}/{}", dirPath->c_str(), kInputRowMetadataFileName); try { - saveStdVectorToFile(columnsToWrapInLazy, lazyListPath.c_str()); + inputRowMetadata.saveToFile(inputRowMetadataPath.c_str()); } catch (std::exception& e) { - lazyListPath = e.what(); + inputRowMetadataPath = e.what(); } } @@ -492,8 +496,8 @@ void ExpressionVerifier::persistReproInfo( ss << " --result_path " << resultPath; } ss << " --sql_path " << sqlPath; - if (!columnsToWrapInLazy.empty()) { - ss << " --lazy_column_list_path " << lazyListPath; + if (!inputRowMetadata.empty()) { + ss << " --input_row_metadata_path " << inputRowMetadataPath; } if (!complexConstants.empty()) { ss << " --complex_constant_path " << complexConstantsPath; @@ -514,12 +518,12 @@ class MinimalSubExpressionFinder { core::TypedExprPtr plan, const RowVectorPtr& rowVector, const std::optional& rowsToVerify, - const std::vector& columnsToWrapInLazy) { - if (verifyWithResults(plan, rowVector, rowsToVerify, columnsToWrapInLazy)) { + const InputRowMetadata& inputRowMetadata) { + if (verifyWithResults(plan, rowVector, rowsToVerify, inputRowMetadata)) { errorExit("Retry should have failed"); } - bool minimalFound = findMinimalRecursive( - plan, rowVector, rowsToVerify, columnsToWrapInLazy); + bool minimalFound = + findMinimalRecursive(plan, rowVector, rowsToVerify, inputRowMetadata); if (minimalFound) { errorExit("Found minimal failing expression."); } else { @@ -540,14 +544,14 @@ class MinimalSubExpressionFinder { core::TypedExprPtr plan, const RowVectorPtr& rowVector, const std::optional& rowsToVerify, - const std::vector& columnsToWrapInLazy) { + const InputRowMetadata& inputRowMetadata) { bool anyFailed = false; for (auto& input : plan->inputs()) { if (!verifyWithResults( - input, rowVector, rowsToVerify, columnsToWrapInLazy)) { + input, rowVector, rowsToVerify, inputRowMetadata)) { anyFailed = true; bool minimalFound = findMinimalRecursive( - input, rowVector, rowsToVerify, columnsToWrapInLazy); + input, rowVector, rowsToVerify, inputRowMetadata); if (minimalFound) { return true; } @@ -556,8 +560,8 @@ class MinimalSubExpressionFinder { if (!anyFailed) { LOG(INFO) << "Failed with all children succeeding: " << plan->toString(); // Re-running the minimum failed. Put breakpoint here to debug. - verifyWithResults(plan, rowVector, rowsToVerify, columnsToWrapInLazy); - if (!columnsToWrapInLazy.empty()) { + verifyWithResults(plan, rowVector, rowsToVerify, inputRowMetadata); + if (!inputRowMetadata.columnsToWrapInLazy.empty()) { LOG(INFO) << "Trying without lazy:"; if (verifyWithResults(plan, rowVector, rowsToVerify, {})) { LOG(INFO) << "Minimal failure succeeded without lazy vectors"; @@ -577,15 +581,15 @@ class MinimalSubExpressionFinder { core::TypedExprPtr plan, const RowVectorPtr& rowVector, const std::optional& rowsToVerify, - const std::vector& columnsToWrapInLazy) { + const InputRowMetadata& inputRowMetadata) { VectorPtr result; LOG(INFO) << "Running with empty results vector :" << plan->toString(); bool emptyResult = - verifyPlan(plan, rowVector, rowsToVerify, columnsToWrapInLazy, result); + verifyPlan(plan, rowVector, rowsToVerify, inputRowMetadata, result); LOG(INFO) << "Running with non empty vector :" << plan->toString(); result = vectorFuzzer_.fuzzFlat(plan->type()); bool filledResult = - verifyPlan(plan, rowVector, rowsToVerify, columnsToWrapInLazy, result); + verifyPlan(plan, rowVector, rowsToVerify, inputRowMetadata, result); if (emptyResult != filledResult) { LOG(ERROR) << fmt::format( "Different results for empty vs populated ! Empty result = {} filledResult = {}", @@ -601,7 +605,7 @@ class MinimalSubExpressionFinder { core::TypedExprPtr plan, const RowVectorPtr& rowVector, const std::optional& rowsToVerify, - const std::vector& columnsToWrapInLazy, + const InputRowMetadata& inputRowMetadata, VectorPtr results) { // Turn off unnecessary logging. FLAGS_minloglevel = 2; @@ -614,7 +618,7 @@ class MinimalSubExpressionFinder { rowsToVerify, results ? BaseVector::copy(*results) : nullptr, true, // canThrow - columnsToWrapInLazy); + inputRowMetadata); } catch (const std::exception&) { success = false; } @@ -633,7 +637,7 @@ void computeMinimumSubExpression( const std::vector& plans, const RowVectorPtr& rowVector, const std::optional& rowsToVerify, - const std::vector& columnsToWrapInLazy) { + const InputRowMetadata& inputRowMetadata) { auto finder = MinimalSubExpressionFinder(std::move(minimalVerifier), fuzzer); if (plans.size() > 1) { LOG(INFO) @@ -645,7 +649,7 @@ void computeMinimumSubExpression( LOG(INFO) << "============================================"; LOG(INFO) << "Finding minimal subexpression for plan:" << plan->toString(); finder.findMinimalExpression( - plan, rowVector, rowsToVerify, columnsToWrapInLazy); + plan, rowVector, rowsToVerify, inputRowMetadata); LOG(INFO) << "============================================"; } } diff --git a/velox/expression/tests/ExpressionVerifier.h b/velox/expression/tests/ExpressionVerifier.h index 4aaa2f4baa29..85d6771d6a2f 100644 --- a/velox/expression/tests/ExpressionVerifier.h +++ b/velox/expression/tests/ExpressionVerifier.h @@ -24,11 +24,13 @@ #include "velox/type/Type.h" #include "velox/vector/BaseVector.h" #include "velox/vector/ComplexVector.h" +#include "velox/vector/VectorSaver.h" #include "velox/vector/fuzzer/VectorFuzzer.h" namespace facebook::velox::test { using exec::test::ReferenceQueryRunner; +using facebook::velox::fuzzer::InputRowMetadata; struct ExpressionVerifierOptions { bool disableConstantFolding{false}; @@ -41,8 +43,8 @@ class ExpressionVerifier { // File names used to persist data required for reproducing a failed test // case. static constexpr const std::string_view kInputVectorFileName = "input_vector"; - static constexpr const std::string_view kIndicesOfLazyColumnsFileName = - "indices_of_lazy_columns"; + static constexpr const std::string_view kInputRowMetadataFileName = + "input_row_metadata"; static constexpr const std::string_view kResultVectorFileName = "result_vector"; static constexpr const std::string_view kExpressionSqlFileName = "sql"; @@ -80,14 +82,14 @@ class ExpressionVerifier { const std::optional& rowsToVerify, VectorPtr&& resultVector, bool canThrow, - std::vector columnsToWarpInLazy = {}); + const InputRowMetadata& inputRowMetadata = {}); private: // Utility method used to serialize the relevant data required to repro a // crash. void persistReproInfo( const VectorPtr& inputVector, - std::vector columnsToWarpInLazy, + const InputRowMetadata& inputRowMetadata, const VectorPtr& resultVector, const std::string& sql, const std::vector& complexConstants); @@ -97,7 +99,7 @@ class ExpressionVerifier { // otherwise. void persistReproInfoIfNeeded( const VectorPtr& inputVector, - const std::vector& columnsToWarpInLazy, + const InputRowMetadata& inputRowMetadata, const VectorPtr& resultVector, const std::string& sql, const std::vector& complexConstants); @@ -117,5 +119,5 @@ void computeMinimumSubExpression( const std::vector& plans, const RowVectorPtr& rowVector, const std::optional& rowsToVerify, - const std::vector& columnsToWrapInLazy); + const InputRowMetadata& inputRowMetadata); } // namespace facebook::velox::test diff --git a/velox/vector/VectorSaver.cpp b/velox/vector/VectorSaver.cpp index e419d0586f12..5c8e5542d3f7 100644 --- a/velox/vector/VectorSaver.cpp +++ b/velox/vector/VectorSaver.cpp @@ -20,6 +20,11 @@ namespace facebook::velox { +void writeBuffer(const BufferPtr& buffer, std::ostream& out); +void writeOptionalBuffer(const BufferPtr& buffer, std::ostream& out); +BufferPtr readBuffer(std::istream& in, memory::MemoryPool* pool); +BufferPtr readOptionalBuffer(std::istream& in, memory::MemoryPool* pool); + namespace { enum class Encoding : int8_t { @@ -102,37 +107,6 @@ Encoding readEncoding(std::istream& in) { } } -void writeBuffer(const BufferPtr& buffer, std::ostream& out) { - write(buffer->size(), out); - out.write(buffer->as(), buffer->size()); -} - -void writeOptionalBuffer(const BufferPtr& buffer, std::ostream& out) { - if (buffer) { - write(true, out); - writeBuffer(buffer, out); - } else { - write(false, out); - } -} - -BufferPtr readBuffer(std::istream& in, memory::MemoryPool* pool) { - auto numBytes = read(in); - auto buffer = AlignedBuffer::allocate(numBytes, pool); - auto rawBuffer = buffer->asMutable(); - in.read(rawBuffer, numBytes); - return buffer; -} - -BufferPtr readOptionalBuffer(std::istream& in, memory::MemoryPool* pool) { - bool hasBuffer = read(in); - if (hasBuffer) { - return readBuffer(in, pool); - } - - return nullptr; -} - template VectorPtr createFlat( const TypePtr& type, @@ -572,9 +546,40 @@ VectorPtr readLazyVector( return std::make_shared( pool, type, size, std::make_unique(loadedVector)); } - } // namespace +void writeBuffer(const BufferPtr& buffer, std::ostream& out) { + VELOX_CHECK_NOT_NULL(buffer); + write(buffer->size(), out); + out.write(buffer->as(), buffer->size()); +} + +void writeOptionalBuffer(const BufferPtr& buffer, std::ostream& out) { + if (buffer) { + write(true, out); + writeBuffer(buffer, out); + } else { + write(false, out); + } +} + +BufferPtr readBuffer(std::istream& in, memory::MemoryPool* pool) { + auto numBytes = read(in); + auto buffer = AlignedBuffer::allocate(numBytes, pool); + auto rawBuffer = buffer->asMutable(); + in.read(rawBuffer, numBytes); + return buffer; +} + +BufferPtr readOptionalBuffer(std::istream& in, memory::MemoryPool* pool) { + bool hasBuffer = read(in); + if (hasBuffer) { + return readBuffer(in, pool); + } + + return nullptr; +} + void saveType(const TypePtr& type, std::ostream& out) { auto serialized = toJson(type->serialize()); write(serialized, out); @@ -717,39 +722,6 @@ std::optional generateFolderPath( return path; } -template -void saveStdVectorToFile(const std::vector& list, const char* filePath) { - std::ofstream outputFile(filePath, std::ofstream::binary); - // Size of the vector - write(list.size(), outputFile); - - outputFile.write( - reinterpret_cast(list.data()), list.size() * sizeof(T)); - outputFile.close(); -} - -template void saveStdVectorToFile( - const std::vector& list, - const char* filePath); -template void saveStdVectorToFile( - const std::vector& list, - const char* filePath); - -template -std::vector restoreStdVectorFromFile(const char* filePath) { - std::ifstream in(filePath, std::ifstream::binary); - auto size = read(in); - std::vector vec(size); - - in.read(reinterpret_cast(vec.data()), size * sizeof(T)); - in.close(); - return vec; -} - -template std::vector restoreStdVectorFromFile( - const char* filePath); -template std::vector restoreStdVectorFromFile(const char* filePath); - void saveSelectivityVector(const SelectivityVector& rows, std::ostream& out) { auto range = rows.asRange(); @@ -773,7 +745,6 @@ SelectivityVector restoreSelectivityVector(std::istream& in) { rows.setFromBits(reinterpret_cast(bits.data()), size); return rows; } - } // namespace facebook::velox template <> diff --git a/velox/vector/VectorSaver.h b/velox/vector/VectorSaver.h index 39a25c374cbe..63f47fbeba7f 100644 --- a/velox/vector/VectorSaver.h +++ b/velox/vector/VectorSaver.h @@ -53,15 +53,6 @@ VectorPtr restoreVectorFromFile(const char* filePath, memory::MemoryPool* pool); /// Reads a string from a file stored by saveStringToFile() method std::string restoreStringFromFile(const char* filePath); -// Write the vector to a file. Contents would include the size of the list -// followed by all the values. -template -void saveStdVectorToFile(const std::vector& list, const char* filePath); - -// Reads a std::vector from a file stored by saveStdVectorToFile() method. -template -std::vector restoreStdVectorFromFile(const char* filePath); - /// Serializes a SelectivityVector into binary format and writes it to the /// provided output stream. void saveSelectivityVector(const SelectivityVector& rows, std::ostream& out); @@ -70,4 +61,20 @@ void saveSelectivityVector(const SelectivityVector& rows, std::ostream& out); /// the provided input stream. SelectivityVector restoreSelectivityVector(std::istream& in); +/// Serializes a BufferPtr into binary format and writes it to the +/// provided output stream. 'buffer' must be non-null. +void writeBuffer(const BufferPtr& buffer, std::ostream& out); + +/// Serializes a optional BufferPtr into binary format and writes it to the +/// provided output stream. +void writeOptionalBuffer(const BufferPtr& buffer, std::ostream& out); + +/// Deserializes a BufferPtr serialized by 'writeBuffer' from the provided +/// input stream. +BufferPtr readBuffer(std::istream& in, memory::MemoryPool* pool); + +/// Deserializes a optional BufferPtr serialized by 'writeOptionalBuffer' from +/// the provided input stream. +BufferPtr readOptionalBuffer(std::istream& in, memory::MemoryPool* pool); + } // namespace facebook::velox diff --git a/velox/vector/fuzzer/VectorFuzzer.h b/velox/vector/fuzzer/VectorFuzzer.h index 8c55c9d53442..6ebbf03dc0a6 100644 --- a/velox/vector/fuzzer/VectorFuzzer.h +++ b/velox/vector/fuzzer/VectorFuzzer.h @@ -328,7 +328,7 @@ class VectorFuzzer { RowVectorPtr rowVector, const std::vector& columnsToWrapInLazy); - /// Generate a random null buffer. + /// Generate a random null buffer. Can return nullptr if no nulls are set. BufferPtr fuzzNulls(vector_size_t size); /// Generate a random indices buffer of 'size' with maximum possible index diff --git a/velox/vector/tests/VectorSaverTest.cpp b/velox/vector/tests/VectorSaverTest.cpp index 144f33f55f3a..b345a3341f68 100644 --- a/velox/vector/tests/VectorSaverTest.cpp +++ b/velox/vector/tests/VectorSaverTest.cpp @@ -622,14 +622,6 @@ TEST_F(VectorSaverTest, LazyVector) { fuzzer); } -TEST_F(VectorSaverTest, stdVector) { - std::vector intVector = {1, 2, 3, 4, 5}; - auto path = exec::test::TempFilePath::create(); - saveStdVectorToFile(intVector, path->getPath().c_str()); - auto copy = restoreStdVectorFromFile(path->getPath().c_str()); - ASSERT_EQ(intVector, copy); -} - namespace { struct VectorSaverInfo { // Path to directory where to store the vector.