From 847b0286793af43ba63aa05a53c4f50cc96d1d02 Mon Sep 17 00:00:00 2001 From: Krishna Pai Date: Wed, 8 May 2024 10:08:07 -0700 Subject: [PATCH] Add direct support for double conversion and change double to string conversions.cpp. --- CMake/resolve_dependency_modules/README.md | 2 +- CMakeLists.txt | 1 + scripts/setup-macos.sh | 2 +- velox/expression/tests/CastExprTest.cpp | 4 + .../functions/prestosql/fuzzer/CMakeLists.txt | 11 + .../prestosql/fuzzer/CastFuzzerTest.cpp | 271 ++++++++++++++++++ velox/type/CMakeLists.txt | 3 +- velox/type/Conversions.cpp | 113 ++++++++ velox/type/Conversions.h | 77 +---- velox/vector/fuzzer/VectorFuzzer.cpp | 8 +- 10 files changed, 421 insertions(+), 71 deletions(-) create mode 100644 velox/functions/prestosql/fuzzer/CastFuzzerTest.cpp diff --git a/CMake/resolve_dependency_modules/README.md b/CMake/resolve_dependency_modules/README.md index a882732809787..2d66a9e37c312 100644 --- a/CMake/resolve_dependency_modules/README.md +++ b/CMake/resolve_dependency_modules/README.md @@ -27,7 +27,7 @@ by Velox. See details on bundling below. | flex | 2.5.13 | No | | bison | 3.0.4 | No | | cmake | 3.14 | No | -| double-conversion | 3.1.5 | No | +| double-conversion | 3.3.0 | No | | xsimd | 10.0.0 | Yes | | re2 | 2021-04-01 | Yes | | fmt | 10.1.1 | Yes | diff --git a/CMakeLists.txt b/CMakeLists.txt index 4885f49bcb3ad..10e03bc065ba3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -532,6 +532,7 @@ if(CMAKE_HOST_SYSTEM_NAME MATCHES "Darwin") endif() find_package(BISON 3.0.4 REQUIRED) find_package(FLEX 2.5.13 REQUIRED) +find_package(double-conversion REQUIRED) include_directories(SYSTEM velox) include_directories(SYSTEM velox/external) diff --git a/scripts/setup-macos.sh b/scripts/setup-macos.sh index f597d2d540034..b6bac15a81f04 100755 --- a/scripts/setup-macos.sh +++ b/scripts/setup-macos.sh @@ -112,7 +112,7 @@ function install_fbthrift { } function install_double_conversion { - github_checkout google/double-conversion v3.1.5 + github_checkout google/double-conversion v3.3.0 cmake_install -DBUILD_TESTING=OFF } diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp index e7660ef058060..58d0944762813 100644 --- a/velox/expression/tests/CastExprTest.cpp +++ b/velox/expression/tests/CastExprTest.cpp @@ -1033,6 +1033,10 @@ TEST_F(CastExprTest, primitiveInvalidCornerCases) { } } +TEST_F(CastExprTest, primitivedtos) { + testCast("varchar", {-0.000416}, {"-4.16E-4"}); +} + TEST_F(CastExprTest, primitiveValidCornerCases) { // To integer. { diff --git a/velox/functions/prestosql/fuzzer/CMakeLists.txt b/velox/functions/prestosql/fuzzer/CMakeLists.txt index 392d206c17222..7e8d55362d124 100644 --- a/velox/functions/prestosql/fuzzer/CMakeLists.txt +++ b/velox/functions/prestosql/fuzzer/CMakeLists.txt @@ -37,3 +37,14 @@ target_link_libraries( velox_functions_prestosql gtest gtest_main) + +add_executable(velox_cast_fuzzer_test CastFuzzerTest.cpp) + +target_link_libraries( + velox_cast_fuzzer_test + velox_fuzzer_util + velox_aggregation_fuzzer_base + velox_expression_test_utility + velox_functions_prestosql + gtest + gtest_main) diff --git a/velox/functions/prestosql/fuzzer/CastFuzzerTest.cpp b/velox/functions/prestosql/fuzzer/CastFuzzerTest.cpp new file mode 100644 index 0000000000000..908f9b282dd77 --- /dev/null +++ b/velox/functions/prestosql/fuzzer/CastFuzzerTest.cpp @@ -0,0 +1,271 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "velox/dwio/dwrf/writer/Writer.h" +#include "velox/exec/fuzzer/AggregationFuzzerRunner.h" +#include "velox/exec/fuzzer/PrestoQueryRunner.h" +#include "velox/exec/tests/utils/PlanBuilder.h" +#include "velox/expression/Expr.h" +#include "velox/expression/RegisterSpecialForm.h" +#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h" + +DEFINE_int64( + seed, + 0, + "Initial seed for random number generator used to reproduce previous " + "results (0 means start with random seed)."); + +DEFINE_string( + presto_url, + "http://127.0.0.1:8080", + "Presto coordinator URI along with port. If set, we use Presto " + "source of truth." + "--presto_url=http://127.0.0.1:8080"); + +DEFINE_uint32( + req_timeout_ms, + 5000, + "Timeout in milliseconds for HTTP requests made to reference DB, " + "such as Presto. Example: --req_timeout_ms=2000"); + +namespace facebook::velox::exec::test { + +auto constexpr batchSize = 1000; +auto constexpr kColName = "c0"; + +class CastFuzzer { + public: + CastFuzzer( + size_t initialSeed, + std::unique_ptr referenceQueryRunner) + : referenceQueryRunner_{std::move(referenceQueryRunner)} { + seed(initialSeed); + parse::registerTypeResolver(); + exec::registerFunctionCallToSpecialForms(); + facebook::velox::aggregate::prestosql::registerAllAggregateFunctions( + "", false, true); + } + + void go() { + VELOX_CHECK( + FLAGS_steps > 0 || FLAGS_duration_sec > 0, + "Either --steps or --duration_sec needs to be greater than zero.") + + auto startTime = std::chrono::system_clock::now(); + size_t iteration = 0; + + while (!isDone(iteration, startTime)) { + LOG(INFO) << "==============================> Started iteration " + << iteration << " (seed: " << currentSeed_ << ")"; + + TypePtr floatingType = vectorFuzzer_.coinToss(0.5) + ? static_cast(DOUBLE()) + : static_cast(REAL()); + + LOG(INFO) << "Choosing type:" << floatingType->toString(); + VELOX_CHECK_NOT_NULL(pool_.get()); + auto input = makeRowVector(floatingType, batchSize); + auto castInput = + std::make_shared( + floatingType, std::string(kColName)); + std::vector expr{ + std::make_shared( + VARCHAR(), castInput, false)}; + + exec::ExprSet exprSet(expr, &execCtx_, false); + SelectivityVector rows(input->size()); + VectorPtr veloxResults; + + // Evaluate Velox results. + evaluate(exprSet, input, rows, veloxResults); + + // Evaluate same results on Presto. + std::string sql = "SELECT cast (c0 as varchar) FROM tmp"; + auto prestoResults = referenceQueryRunner_->executeVector( + sql, {input}, ROW({kColName}, {VARCHAR()})); + VELOX_CHECK_GT(prestoResults.size(), 0); + + // Compare results, we are using a custom way of comparing results, + // since results obtained from Java can be off in the least significant + // digit. + auto veloxCastVector = veloxResults->asFlatVector(); + auto prestoCastVector = + prestoResults[0]->childAt(0)->asFlatVector(); + + auto exactMatch = 0; + auto leastSignificantDigitOff = 0; + + VELOX_CHECK_EQ(veloxCastVector->size(), prestoCastVector->size()); + + for (auto i = 0; i < veloxCastVector->size(); i++) { + auto matchType = compareVarchars(prestoCastVector, veloxCastVector, i); + + if (matchType == MatchType::EXACT_MATCH) { + exactMatch++; + } else if (matchType == MatchType::LEAST_SIGNIFICANT_DIGIT_OFF) { + leastSignificantDigitOff++; + } else { + VELOX_FAIL(fmt::format( + "Cast conversion not matching for: {} to {}", + prestoCastVector->valueAt(i), + veloxCastVector->valueAt(i))); + } + } + + LOG(INFO) << fmt::format( + "Results EXACT : {}, LEAST SIGNIFICANT: {}", + exactMatch, + leastSignificantDigitOff); + + seed(rng_()); + ++iteration; + } + }; + + private: + enum class MatchType { + EXACT_MATCH = 1, + LEAST_SIGNIFICANT_DIGIT_OFF, + NOT_A_MATCH + }; + + /// Compare's two StringView vectors at index i. The StringView vectors are + /// results of conversion of a floating point type to string. Returns + /// EXACT_MATCH if both the strings are eactly matching. + /// LEAST_SIGNIFICANT_DIGIT_OFF if they are off in the least siginficant digit + /// NOT_A_MATCH if exponents or any other part of mantissa doesnt match. + MatchType compareVarchars( + const FlatVector* expected, + const FlatVector* actual, + size_t i) { + VELOX_CHECK_EQ(expected->size(), actual->size()); + VELOX_CHECK_LT(i, expected->size()); + + if (expected->isNullAt(i) && actual->isNullAt(i)) { + return MatchType::EXACT_MATCH; + } else if (expected->isNullAt(i) || actual->isNullAt(i)) { + return MatchType::NOT_A_MATCH; + } + + auto expectedString = expected->valueAt(i).getString(); + auto actualString = actual->valueAt(i).getString(); + + if (expectedString == actualString) { + return MatchType::EXACT_MATCH; + } + + auto splitIntoMantissaExponent = [](std::string& val) { + auto pos = val.find('E'); + return std::tuple(val.substr(0, pos - 1), val.substr(pos + 1)); + }; + + auto [expectedMantissa, expectedExponent] = + splitIntoMantissaExponent(expectedString); + auto [actualMantissa, actualExponent] = + splitIntoMantissaExponent(actualString); + + // Exponents should always be equal + VELOX_CHECK_EQ(actualExponent, expectedExponent); + + if (expectedMantissa == actualMantissa) { + return MatchType::EXACT_MATCH; + } + + // Handle case when the last significant digit is off or missing + auto smallestLength = expectedMantissa.length() < actualMantissa.length() + ? expectedMantissa.length() + : actualMantissa.length(); + + if (expectedMantissa.substr(0, smallestLength) == + actualMantissa.substr(0, smallestLength)) { + return MatchType::LEAST_SIGNIFICANT_DIGIT_OFF; + } + + // Note we dont handle cases like ..5999 and ..6000 where last n digits get + // rounded off + return MatchType::NOT_A_MATCH; + } + + void evaluate( + exec::ExprSet& exprSet, + const RowVectorPtr& data, + const SelectivityVector& rows, + VectorPtr& result) { + exec::EvalCtx evalCtx(&execCtx_, &exprSet, data.get()); + std::vector results{result}; + exprSet.eval(rows, evalCtx, results); + + if (!result) { + result = results[0]; + } + } + + RowVectorPtr makeRowVector(const TypePtr& type, vector_size_t size) { + VectorFuzzer::Options opts; + opts.vectorSize = size; + opts.nullRatio = 0; + VectorPtr input = vectorFuzzer_.fuzzFlat(type); + return vectorMaker_.rowVector( + std::vector{std::string(kColName)}, + std::vector{std::move(input)}); + } + + void seed(size_t seed) { + currentSeed_ = seed; + vectorFuzzer_.reSeed(seed); + rng_.seed(currentSeed_); + } + + FuzzerGenerator rng_; + size_t currentSeed_{0}; + std::unique_ptr referenceQueryRunner_; + + std::shared_ptr rootPool_{ + memory::memoryManager()->addRootPool()}; + std::shared_ptr pool_{rootPool_->addLeafChild("leaf")}; + std::shared_ptr queryCtx_{std::make_shared()}; + core::ExecCtx execCtx_{pool_.get(), queryCtx_.get()}; + facebook::velox::test::VectorMaker vectorMaker_{pool_.get()}; + VectorFuzzer vectorFuzzer_{{}, pool_.get()}; +}; + +} // namespace facebook::velox::exec::test + +int main(int argc, char** argv) { + using namespace facebook::velox; + + ::testing::InitGoogleTest(&argc, argv); + folly::Init init(&argc, &argv); + + memory::MemoryManager::initialize({}); + dwrf::registerDwrfWriterFactory(); + + auto referenceQueryRunner = std::make_unique( + FLAGS_presto_url, + "aggregation_fuzzer", + static_cast(FLAGS_req_timeout_ms)); + + size_t initialSeed = FLAGS_seed == 0 ? std::time(nullptr) : FLAGS_seed; + auto castFuzzer = facebook::velox::exec::test::CastFuzzer( + initialSeed, std::move(referenceQueryRunner)); + + castFuzzer.go(); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/velox/type/CMakeLists.txt b/velox/type/CMakeLists.txt index 326013c68c21a..78308717dbb5e 100644 --- a/velox/type/CMakeLists.txt +++ b/velox/type/CMakeLists.txt @@ -46,4 +46,5 @@ target_link_libraries( velox_status Boost::headers Folly::folly - re2::re2) + re2::re2 + double-conversion::double-conversion) diff --git a/velox/type/Conversions.cpp b/velox/type/Conversions.cpp index a49e22898945e..28aa85a24c6b2 100644 --- a/velox/type/Conversions.cpp +++ b/velox/type/Conversions.cpp @@ -23,3 +23,116 @@ DEFINE_bool( " format of type conversions used for casting. This is a temporary solution" " that aims to facilitate a seamless transition for users who rely on the" " legacy behavior and hence can change in the future."); + +namespace facebook::velox::util { +/// Normalize the given floating-point standard notation string in place, by +/// appending '.0' if it has only the integer part but no fractional part. For +/// example, for the given string '12345', replace it with '12345.0'. +void normalizeStandardNotation(std::string& str) { + if (!FLAGS_experimental_enable_legacy_cast && + str.find(".") == std::string::npos && isdigit(str[str.length() - 1])) { + str += ".0"; + } +} + +/// Normalize the given floating-point scientific notation string in place, by +/// removing the trailing 0s of the coefficient as well as the leading '+' and +/// 0s of the exponent. For example, for the given string '3.0000000E+005', +/// replace it with '3.0E5'. For '-1.2340000E-010', replace it with +/// '-1.234E-10'. +void normalizeScientificNotation(std::string& str) { + size_t idxE = str.find('E'); + + if (idxE == std::string::npos) { + VELOX_DCHECK_NE( + idxE, + std::string::npos, + "Expect a character 'E' in scientific notation."); + } + + int endCoef = idxE - 1; + while (endCoef >= 0 && str[endCoef] == '0') { + --endCoef; + } + if (endCoef == 0) { + VELOX_DCHECK_GT(endCoef, 0, "Coefficient should not be all zeros."); + } + + int pos = endCoef + 1; + if (str[endCoef] == '.') { + pos++; + } + str[pos++] = 'E'; + + int startExp = idxE + 1; + if (str[startExp] == '-') { + str[pos++] = '-'; + startExp++; + } + while (startExp < str.length() && + (str[startExp] == '0' || str[startExp] == '+')) { + startExp++; + } + VELOX_DCHECK_LT(startExp, str.length(), "Exponent should not be all zeros."); + str.replace(pos, str.length() - startExp, str, startExp); + pos += str.length() - startExp; + + str.resize(pos); +} + +template +std::string floatingToString(const T& val) { + using doubleToStringConv = double_conversion::DoubleToStringConverter; + + auto dconvertor = doubleToStringConv( + doubleToStringConv::EMIT_TRAILING_DECIMAL_POINT | + doubleToStringConv::EMIT_TRAILING_ZERO_AFTER_POINT | + doubleToStringConv::EMIT_TRAILING_ZERO_AFTER_POINT_IN_EXPONENTIAL | + doubleToStringConv::EMIT_TRAILING_DECIMAL_POINT_IN_EXPONENTIAL, + "Infinity", + "NaN", + 'E', + std::is_same_v ? -5 : -7, + std::is_same_v ? -5 : 7, + 0, + 0); + + // Implementation below is close to String.of(double) of Java. For + // example, for some rare cases the result differs in precision by + // the least significant bit. + if (FOLLY_UNLIKELY(std::isinf(val) || std::isnan(val))) { + return folly::to(val); + } + if ((val > -10'000'000 && val <= -0.001) || + (val >= 0.001 && val < 10'000'000) || val == 0.0) { + auto str = fmt::format("{}", val); + normalizeStandardNotation(str); + return str; + } + + // We choose a buffer size of 22 which allows us to cover precision of 17 + // digits , 4 digits of exponent , and the exponent char 'E'. + const int kBufferSize = 22; + char buffer[kBufferSize]; + double_conversion::StringBuilder builder{buffer, kBufferSize}; + if constexpr (std::is_same_v) { + dconvertor.ToShortestSingle(val, &builder); + } else { + dconvertor.ToExponential(val, 16, &builder); + } + + auto convertedResult = std::string{builder.Finalize()}; + + normalizeScientificNotation(convertedResult); + return convertedResult; +} + +std::string floatToString(const float& val) { + return floatingToString(val); +} + +std::string doubleToString(const double& val) { + return floatingToString(val); +} + +} // namespace facebook::velox::util diff --git a/velox/type/Conversions.h b/velox/type/Conversions.h index 0377d14be338c..ded1b009ced78 100644 --- a/velox/type/Conversions.h +++ b/velox/type/Conversions.h @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -453,6 +454,12 @@ struct Converter { } }; +// Added to make build's faster. +std::string floatToString(const float& val); +std::string doubleToString(const double& val); +void normalizeScientificNotation(std::string& str); +void normalizeStandardNotation(std::string& str); + template struct Converter { template @@ -464,24 +471,11 @@ struct Converter { return str; } - // Implementation below is close to String.of(double) of Java. For - // example, for some rare cases the result differs in precision by - // the least significant bit. - if (FOLLY_UNLIKELY(std::isinf(val) || std::isnan(val))) { - return folly::to(val); - } - if ((val > -10'000'000 && val <= -0.001) || - (val >= 0.001 && val < 10'000'000) || val == 0.0) { - auto str = fmt::format("{}", val); - normalizeStandardNotation(str); - return str; + if constexpr (std::is_same_v) { + return doubleToString(val); + } else { + return floatToString(val); } - // Precision of float is at most 8 significant decimal digits. Precision - // of double is at most 17 significant decimal digits. - auto str = - fmt::format(std::is_same_v ? "{:.7E}" : "{:.16E}", val); - normalizeScientificNotation(str); - return str; } return folly::to(val); @@ -501,56 +495,7 @@ struct Converter { return val ? "true" : "false"; } - /// Normalize the given floating-point standard notation string in place, by - /// appending '.0' if it has only the integer part but no fractional part. For - /// example, for the given string '12345', replace it with '12345.0'. - static void normalizeStandardNotation(std::string& str) { - if (!FLAGS_experimental_enable_legacy_cast && - str.find(".") == std::string::npos && isdigit(str[str.length() - 1])) { - str += ".0"; - } - } - - /// Normalize the given floating-point scientific notation string in place, by - /// removing the trailing 0s of the coefficient as well as the leading '+' and - /// 0s of the exponent. For example, for the given string '3.0000000E+005', - /// replace it with '3.0E5'. For '-1.2340000E-010', replace it with - /// '-1.234E-10'. - static void normalizeScientificNotation(std::string& str) { - size_t idxE = str.find('E'); - VELOX_DCHECK_NE( - idxE, - std::string::npos, - "Expect a character 'E' in scientific notation."); - - int endCoef = idxE - 1; - while (endCoef >= 0 && str[endCoef] == '0') { - --endCoef; - } - VELOX_DCHECK_GT(endCoef, 0, "Coefficient should not be all zeros."); - - int pos = endCoef + 1; - if (str[endCoef] == '.') { - pos++; - } - str[pos++] = 'E'; - int startExp = idxE + 1; - if (str[startExp] == '-') { - str[pos++] = '-'; - startExp++; - } - while (startExp < str.length() && - (str[startExp] == '0' || str[startExp] == '+')) { - startExp++; - } - VELOX_DCHECK_LT( - startExp, str.length(), "Exponent should not be all zeros."); - str.replace(pos, str.length() - startExp, str, startExp); - pos += str.length() - startExp; - - str.resize(pos); - } }; // Allow conversions from string to TIMESTAMP type. diff --git a/velox/vector/fuzzer/VectorFuzzer.cpp b/velox/vector/fuzzer/VectorFuzzer.cpp index bb4578d42b203..5088c908c6b1e 100644 --- a/velox/vector/fuzzer/VectorFuzzer.cpp +++ b/velox/vector/fuzzer/VectorFuzzer.cpp @@ -90,12 +90,16 @@ int64_t rand(FuzzerGenerator& rng) { template <> double rand(FuzzerGenerator& rng) { - return boost::random::uniform_01()(rng); + auto d = boost::random::uniform_01()(rng); + auto scale = rand(rng); + return double(d * scale); } template <> float rand(FuzzerGenerator& rng) { - return boost::random::uniform_01()(rng); + auto f = boost::random::uniform_01()(rng); + auto scale = rand(rng); + return f * scale; } template <>