diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index e8a68eda92f5b..8e619be32d6e0 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -91,7 +91,7 @@ jobs: - name: Make Release Build env: MAKEFLAGS: 'NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4' - CUDA_ARCHITECTURES: 60 + CUDA_ARCHITECTURES: 70 CUDA_COMPILER: /usr/local/cuda-${CUDA_VERSION}/bin/nvcc # Without that, nvcc picks /usr/bin/c++ which is GCC 8 CUDA_FLAGS: "-ccbin /opt/rh/gcc-toolset-9/root/usr/bin" diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml index cfc0742056692..68814db1ad74d 100644 --- a/.github/workflows/scheduled.yml +++ b/.github/workflows/scheduled.yml @@ -264,14 +264,14 @@ jobs: uses: actions/upload-artifact@v4 with: name: presto - path: velox/_build/debug/velox/expression/tests/velox_expression_fuzzer_test + path: velox/_build/debug/velox/expression/fuzzer/velox_expression_fuzzer_test retention-days: "${{ env.RETENTION }}" - name: Upload spark expression fuzzer uses: actions/upload-artifact@v4 with: name: spark_expression_fuzzer - path: velox/_build/debug/velox/expression/tests/spark_expression_fuzzer_test + path: velox/_build/debug/velox/expression/fuzzer/spark_expression_fuzzer_test retention-days: "${{ env.RETENTION }}" - name: Upload spark aggregation fuzzer diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index c4d716cbc4f56..0000000000000 --- a/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "third_party/googletest"] - path = third_party/googletest - url = https://github.com/google/googletest.git -[submodule "third_party/xsimd"] - path = third_party/xsimd - url = https://github.com/xtensor-stack/xsimd.git diff --git a/Makefile b/Makefile index 3e8ff17397bdf..b604d560adf64 100644 --- a/Makefile +++ b/Makefile @@ -161,7 +161,7 @@ unittest: debug #: Build with debugging and run unit tests # Build with debugging and run expression fuzzer test. Use a fixed seed to # ensure the tests are reproducible. fuzzertest: debug - $(BUILD_BASE_DIR)/debug/velox/expression/tests/velox_expression_fuzzer_test \ + $(BUILD_BASE_DIR)/debug/velox/expression/fuzzer/velox_expression_fuzzer_test \ --seed $(FUZZER_SEED) \ --duration_sec $(FUZZER_DURATION_SEC) \ --repro_persist_path $(FUZZER_REPRO_PERSIST_PATH) \ diff --git a/README.md b/README.md index 69fd642a1d38a..3e1dbc3cade3d 100644 --- a/README.md +++ b/README.md @@ -72,11 +72,8 @@ Blog posts are available [here](https://velox-lib.io/blog). ### Get the Velox Source ``` -git clone --recursive https://github.com/facebookincubator/velox.git +git clone https://github.com/facebookincubator/velox.git cd velox -# if you are updating an existing checkout -git submodule sync --recursive -git submodule update --init --recursive ``` Once Velox is checked out, the first step is to install the dependencies. Details on the dependencies and how Velox manages some of them for you @@ -90,7 +87,7 @@ dependencies for a given platform. 
On an Intel MacOS machine you can setup and then build like so:

```shell
-$ ./scripts/setup-macos.sh 
+$ ./scripts/setup-macos.sh
 $ make
```

@@ -117,7 +114,7 @@ $ CPU_TARGET="aarch64" make
 Once you have checked out Velox, you can setup and build like so:
 
 ```shell
-$ ./scripts/setup-ubuntu.sh 
+$ ./scripts/setup-ubuntu.sh
 $ make
```

@@ -135,7 +132,7 @@ Note that,
   * f16c
 * Velox tries to use the following (or equivalent) instruction sets where available:
   * On Intel CPUs
-    * avx 
+    * avx
     * avx2
     * sse
   * On ARM
@@ -167,7 +164,7 @@ contribute to the project.
 ## Community
 The main communication channel with the Velox OSS community is through the
-[the Velox-OSS Slack workspace](http://velox-oss.slack.com). 
+[Velox-OSS Slack workspace](http://velox-oss.slack.com).
 Please reach out to **velox@meta.com** to get access to Velox Slack Channel.
diff --git a/velox/common/base/CMakeLists.txt b/velox/common/base/CMakeLists.txt
index 37ccfe6f8f894..d70afb59d639a 100644
--- a/velox/common/base/CMakeLists.txt
+++ b/velox/common/base/CMakeLists.txt
@@ -22,6 +22,7 @@ add_library(
   BitUtil.cpp
   Counters.cpp
   Fs.cpp
+  PeriodicStatsReporter.cpp
   RandomUtil.cpp
   RawVector.cpp
   RuntimeMetrics.cpp
diff --git a/velox/common/base/Counters.cpp b/velox/common/base/Counters.cpp
index 6cf0a8fcf8c82..f7e8ab9951fdc 100644
--- a/velox/common/base/Counters.cpp
+++ b/velox/common/base/Counters.cpp
@@ -48,6 +48,24 @@ void registerVeloxMetrics() {
   /// ================== Memory Arbitration Counters =================
 
+  // The number of arbitration requests.
+  DEFINE_METRIC(
+      kMetricArbitratorRequestsCount, facebook::velox::StatType::COUNT);
+
+  // The number of times a query level memory pool is aborted as a result of a
+  // memory arbitration process. The memory pool aborted will eventually result
+  // in the cancellation of the original query.
+  DEFINE_METRIC(
+      kMetricArbitratorAbortedCount, facebook::velox::StatType::COUNT);
+
+  // The number of times a memory arbitration request failed. This may occur
+  // either because the requester was terminated during the processing of its
+  // request, the arbitration request would surpass the maximum allowed capacity
+  // for the requester, or the arbitration process couldn't release the
+  // requested amount of memory.
+  DEFINE_METRIC(
+      kMetricArbitratorFailuresCount, facebook::velox::StatType::COUNT);
+
   // Tracks the memory reclaim count on an operator.
   DEFINE_METRIC(kMetricMemoryReclaimCount, facebook::velox::StatType::COUNT);
 
@@ -82,10 +100,6 @@ void registerVeloxMetrics() {
   DEFINE_METRIC(
       kMetricMemoryNonReclaimableCount, facebook::velox::StatType::COUNT);
 
-  // The number of arbitration requests.
-  DEFINE_METRIC(
-      kMetricArbitratorRequestsCount, facebook::velox::StatType::COUNT);
-
   // The number of arbitration that reclaims the used memory from the query
   // which initiates the memory arbitration request itself. It ensures the
   // memory arbitration request won't exceed its per-query memory capacity
@@ -103,20 +117,6 @@
       kMetricArbitratorGlobalArbitrationCount, facebook::velox::StatType::COUNT);
 
-  // The number of times a query level memory pool is aborted as a result of a
-  // memory arbitration process. The memory pool aborted will eventually result
-  // in a cancelling the original query.
-  DEFINE_METRIC(
-      kMetricArbitratorAbortedCount, facebook::velox::StatType::COUNT);
-
-  // The number of times a memory arbitration request failed.
This may occur - // either because the requester was terminated during the processing of its - // request, the arbitration request would surpass the maximum allowed capacity - // for the requester, or the arbitration process couldn't release the - // requested amount of memory. - DEFINE_METRIC( - kMetricArbitratorFailuresCount, facebook::velox::StatType::COUNT); - // The distribution of the amount of time an arbitration request stays queued // in range of [0, 600s] with 20 buckets. It is configured to report the // latency at P50, P90, P99, and P100 percentiles. diff --git a/velox/common/base/Counters.h b/velox/common/base/Counters.h index 67fd967c938ef..f1fca90d8ea31 100644 --- a/velox/common/base/Counters.h +++ b/velox/common/base/Counters.h @@ -70,21 +70,12 @@ constexpr folly::StringPiece kMetricMemoryPoolReservationLeakBytes{ constexpr folly::StringPiece kMetricMemoryAllocatorDoubleFreeCount{ "velox.memory_allocator_double_free_count"}; -constexpr folly::StringPiece kMetricArbitratorRequestsCount{ - "velox.arbitrator_requests_count"}; - constexpr folly::StringPiece kMetricArbitratorLocalArbitrationCount{ "velox.arbitrator_local_arbitration_count"}; constexpr folly::StringPiece kMetricArbitratorGlobalArbitrationCount{ "velox.arbitrator_global_arbitration_count"}; -constexpr folly::StringPiece kMetricArbitratorAbortedCount{ - "velox.arbitrator_aborted_count"}; - -constexpr folly::StringPiece kMetricArbitratorFailuresCount{ - "velox.arbitrator_failures_count"}; - constexpr folly::StringPiece kMetricArbitratorQueueTimeMs{ "velox.arbitrator_queue_time_ms"}; @@ -128,4 +119,13 @@ constexpr folly::StringPiece kMetricSpillWriteTimeMs{ constexpr folly::StringPiece kMetricFileWriterEarlyFlushedRawBytes{ "velox.file_writer_early_flushed_raw_bytes"}; + +constexpr folly::StringPiece kMetricArbitratorRequestsCount{ + "velox.arbitrator_requests_count"}; + +constexpr folly::StringPiece kMetricArbitratorAbortedCount{ + "velox.arbitrator_aborted_count"}; + +constexpr folly::StringPiece kMetricArbitratorFailuresCount{ + "velox.arbitrator_failures_count"}; } // namespace facebook::velox diff --git a/velox/common/base/PeriodicStatsReporter.cpp b/velox/common/base/PeriodicStatsReporter.cpp new file mode 100644 index 0000000000000..f3c3b6e5b6086 --- /dev/null +++ b/velox/common/base/PeriodicStatsReporter.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/base/PeriodicStatsReporter.h" +#include "velox/common/base/Counters.h" +#include "velox/common/base/StatsReporter.h" +#include "velox/common/memory/Memory.h" + +namespace facebook::velox { + +namespace { +#define REPORT_IF_NOT_ZERO(name, counter) \ + if ((counter) != 0) { \ + RECORD_METRIC_VALUE((name), (counter)); \ + } + +std::mutex& instanceMutex() { + static std::mutex instanceMu; + return instanceMu; +} + +// Global instance. Must be called while holding a lock over instanceMutex(). 
+std::unique_ptr<PeriodicStatsReporter>& instance() {
+  static std::unique_ptr<PeriodicStatsReporter> reporter;
+  return reporter;
+}
+} // namespace
+
+void startPeriodicStatsReporter(const PeriodicStatsReporter::Options& options) {
+  std::lock_guard<std::mutex> l(instanceMutex());
+  auto& instanceRef = instance();
+  VELOX_CHECK_NULL(
+      instanceRef, "The periodic stats reporter has already started.");
+  instanceRef = std::make_unique<PeriodicStatsReporter>(options);
+  instanceRef->start();
+}
+
+void stopPeriodicStatsReporter() {
+  std::lock_guard<std::mutex> l(instanceMutex());
+  auto& instanceRef = instance();
+  VELOX_CHECK_NOT_NULL(instanceRef, "No periodic stats reporter to stop.");
+  instanceRef->stop();
+  instanceRef.reset();
+}
+
+PeriodicStatsReporter::PeriodicStatsReporter(const Options& options)
+    : arbitrator_(options.arbitrator), options_(options) {}
+
+void PeriodicStatsReporter::start() {
+  LOG(INFO) << "Starting PeriodicStatsReporter with options "
+            << options_.toString();
+  addTask(
+      "report_arbitrator_stats",
+      [this]() { reportArbitratorStats(); },
+      options_.arbitratorStatsIntervalMs);
+}
+
+void PeriodicStatsReporter::stop() {
+  LOG(INFO) << "Stopping PeriodicStatsReporter";
+  scheduler_.stop();
+}
+
+void PeriodicStatsReporter::reportArbitratorStats() {
+  if (arbitrator_ == nullptr) {
+    return;
+  }
+
+  const auto stats = arbitrator_->stats();
+  RECORD_METRIC_VALUE(
+      kMetricArbitratorFreeCapacityBytes,
+      stats.freeCapacityBytes + stats.freeReservedCapacityBytes);
+  RECORD_METRIC_VALUE(
+      kMetricArbitratorFreeReservedCapacityBytes,
+      stats.freeReservedCapacityBytes);
+}
+
+} // namespace facebook::velox
diff --git a/velox/common/base/PeriodicStatsReporter.h b/velox/common/base/PeriodicStatsReporter.h
new file mode 100644
index 0000000000000..7621ac99a01cb
--- /dev/null
+++ b/velox/common/base/PeriodicStatsReporter.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/experimental/ThreadedRepeatingFunctionRunner.h>
+#include "velox/common/memory/MemoryArbitrator.h"
+
+namespace folly {
+class CPUThreadPoolExecutor;
+}
+
+namespace facebook::velox {
+
+namespace memory {
+class MemoryAllocator;
+}
+
+namespace cache {
+class AsyncDataCache;
+}
+
+/// Manages a background daemon thread to report stats through 'StatsReporter'.
+class PeriodicStatsReporter {
+ public:
+  struct Options {
+    Options() {}
+
+    const memory::MemoryArbitrator* arbitrator{nullptr};
+
+    uint64_t arbitratorStatsIntervalMs{60'000};
+
+    std::string toString() const {
+      return fmt::format(
+          "arbitratorStatsIntervalMs:{}", arbitratorStatsIntervalMs);
+    }
+  };
+
+  PeriodicStatsReporter(const Options& options = Options());
+
+  /// Invoked to start the report daemon in background.
+  void start();
+
+  /// Invoked to stop the report daemon in background.
+  void stop();
+
+ private:
+  // Add a task to run periodically.
+  template <typename TFunc>
+  void addTask(const std::string& taskName, TFunc&& func, size_t intervalMs) {
+    scheduler_.add(
+        taskName,
+        [taskName,
+         intervalMs,
+         func = std::forward<TFunc>(func)]() mutable noexcept {
+          try {
+            func();
+          } catch (const std::exception& e) {
+            LOG(ERROR) << "Error running periodic task " << taskName << ": "
+                       << e.what();
+          }
+          return std::chrono::milliseconds(intervalMs);
+        });
+  }
+
+  void reportArbitratorStats();
+
+  const velox::memory::MemoryArbitrator* const arbitrator_{nullptr};
+  const Options options_;
+
+  folly::ThreadedRepeatingFunctionRunner scheduler_;
+};
+
+/// Initializes and starts the process-wide periodic stats reporter. Before
+/// 'stopPeriodicStatsReporter()' is called, this method can only be called once
+/// process-wide, and additional calls to this method will throw.
+void startPeriodicStatsReporter(const PeriodicStatsReporter::Options& options);
+
+/// Stops the process-wide periodic stats reporter.
+void stopPeriodicStatsReporter();
+
+} // namespace facebook::velox
diff --git a/velox/common/base/VeloxException.cpp b/velox/common/base/VeloxException.cpp
index 701edb16c5197..12371def75f2e 100644
--- a/velox/common/base/VeloxException.cpp
+++ b/velox/common/base/VeloxException.cpp
@@ -43,29 +43,31 @@ ExceptionContext& getExceptionContext() {
   return context;
 }
 
-// Retrieves the message of the top-level ancestor of the current exception
-// context. If the top-level context message is not empty and is the same as the
-// current one, returns a string indicating they are the same.
-std::string getTopLevelExceptionContextString(
+// Traverses the context hierarchy and appends messages from all contexts that
+// are marked as essential.
+std::string getAdditionalExceptionContextString(
     VeloxException::Type exceptionType,
     const std::string& currentMessage) {
   auto* context = &getExceptionContext();
-  if (context->parent && context->parent->parent) {
-    while (context->parent && context->parent->parent) {
-      context = context->parent;
-    }
-    auto topLevelMessage = context->message(exceptionType);
-    if (!topLevelMessage.empty() && topLevelMessage == currentMessage) {
-      return "Same as context.";
-    } else {
-      return topLevelMessage;
+  std::string additionalMessage = "";
+  if (!context->parent || !context->parent->parent) {
+    return additionalMessage;
+  }
+  context = context->parent;
+  while (context->parent) {
+    if (context->isEssential) {
+      auto message = context->message(exceptionType);
+      if (!message.empty()) {
+        additionalMessage += message + " ";
+      }
     }
+    context = context->parent;
  }
-
-  if (!currentMessage.empty()) {
-    return "Same as context.";
+  if (!additionalMessage.empty()) {
+    // Get rid of the extra space at the end.
+    additionalMessage.pop_back();
   }
-  return "";
+  return additionalMessage;
 }
 
 VeloxException::VeloxException(
@@ -90,8 +92,8 @@ VeloxException::VeloxException(
     state.errorSource = errorSource;
     state.errorCode = errorCode;
     state.context = getExceptionContext().message(exceptionType);
-    state.topLevelContext =
-        getTopLevelExceptionContextString(exceptionType, state.context);
+    state.additionalContext =
+        getAdditionalExceptionContextString(exceptionType, state.context);
     state.isRetriable = isRetriable;
   })) {}
 
@@ -114,8 +116,8 @@ VeloxException::VeloxException(
     state.errorSource = errorSource;
     state.errorCode = errorCode;
     state.context = getExceptionContext().message(exceptionType);
-    state.topLevelContext =
-        getTopLevelExceptionContextString(exceptionType, state.context);
+    state.additionalContext =
+        getAdditionalExceptionContextString(exceptionType, state.context);
     state.isRetriable = isRetriable;
     state.wrappedException = e;
   })) {}
 
@@ -223,8 +225,8 @@ void VeloxException::State::finalize() const {
     elaborateMessage += "Context: " + context + "\n";
   }
 
-  if (!topLevelContext.empty()) {
-    elaborateMessage += "Top-Level Context: " + topLevelContext + "\n";
+  if (!additionalContext.empty()) {
+    elaborateMessage += "Additional Context: " + additionalContext + "\n";
   }
 
   if (function) {
diff --git a/velox/common/base/VeloxException.h b/velox/common/base/VeloxException.h
index 32e96b9a166e8..ae7b8fdab46b3 100644
--- a/velox/common/base/VeloxException.h
+++ b/velox/common/base/VeloxException.h
@@ -207,8 +207,8 @@ class VeloxException : public std::exception {
     return state_->context;
   }
 
-  const std::string& topLevelContext() const {
-    return state_->topLevelContext;
+  const std::string& additionalContext() const {
+    return state_->additionalContext;
   }
 
   const std::exception_ptr& wrappedException() const {
@@ -230,7 +230,7 @@ class VeloxException : public std::exception {
     // The current exception context.
     std::string context;
-    // The top-level ancestor of the current exception context.
-    std::string topLevelContext;
+    // The additional context of the current exception.
+    std::string additionalContext;
     bool isRetriable;
     // The original std::exception.
     std::exception_ptr wrappedException;
@@ -353,6 +353,10 @@ struct ExceptionContext {
   /// Value to pass to `messageFunc`. Can be null.
   void* arg{nullptr};
 
+  /// If true, then the additional context in 'this' is always included when
+  /// there are hierarchical exception contexts.
+  bool isEssential{false};
+
   /// Pointer to the parent context when there are hierarchical exception
   /// contexts.
   ExceptionContext* parent{nullptr};
diff --git a/velox/common/base/tests/CMakeLists.txt b/velox/common/base/tests/CMakeLists.txt
index ebab3d8f75504..22c173c99e3d4 100644
--- a/velox/common/base/tests/CMakeLists.txt
+++ b/velox/common/base/tests/CMakeLists.txt
@@ -23,9 +23,9 @@ add_executable(
   FsTest.cpp
   RangeTest.cpp
   RawVectorTest.cpp
-  ScratchTest.cpp
   RuntimeMetricsTest.cpp
   ScopedLockTest.cpp
+  ScratchTest.cpp
   SemaphoreTest.cpp
   SimdUtilTest.cpp
   SpillConfigTest.cpp
@@ -38,7 +38,9 @@ add_test(velox_base_test velox_base_test)
 
 target_link_libraries(
   velox_base_test
-  PRIVATE velox_common_base
+  PRIVATE velox_caching
+          velox_common_base
+          velox_memory
           velox_time
           velox_status
           velox_exception
diff --git a/velox/common/base/tests/ExceptionTest.cpp b/velox/common/base/tests/ExceptionTest.cpp
index 4e5dd6dbaa54d..9386b8cb672e0 100644
--- a/velox/common/base/tests/ExceptionTest.cpp
+++ b/velox/common/base/tests/ExceptionTest.cpp
@@ -583,11 +583,13 @@ TEST(ExceptionTest, context) {
   };
 
   {
-    // Create multi-layer contexts.
+ // Create multi-layer contexts with top level marked as essential. MessageFunctionArg topLevelTroubleshootingAid{ "Top-level troubleshooting aid.", &callCount}; - facebook::velox::ExceptionContextSetter topLevelContext( - {messageFunction, &topLevelTroubleshootingAid}); + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, + .arg = &topLevelTroubleshootingAid, + .isEssential = true}); MessageFunctionArg midLevelTroubleshootingAid{ "Mid-level troubleshooting aid.", &callCount}; @@ -608,7 +610,7 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: System error: Inner-level troubleshooting aid." - "\nTop-Level Context: System error: Top-level troubleshooting aid." + "\nAdditional Context: System error: Top-level troubleshooting aid." "\nFunction: operator()" "\nFile: "); @@ -623,13 +625,164 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: User error: Inner-level troubleshooting aid." - "\nTop-Level Context: User error: Top-level troubleshooting aid." + "\nAdditional Context: User error: Top-level troubleshooting aid." "\nFunction: operator()" "\nFile: "); EXPECT_EQ(4, callCount); } + { + callCount = 0; + // Create multi-layer contexts with middle level marked as essential. + MessageFunctionArg topLevelTroubleshootingAid{ + "Top-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, .arg = &topLevelTroubleshootingAid}); + + MessageFunctionArg midLevelTroubleshootingAid{ + "Mid-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter midLevelContext( + {.messageFunc = messageFunction, + .arg = &midLevelTroubleshootingAid, + .isEssential = true}); + + MessageFunctionArg innerLevelTroubleshootingAid{ + "Inner-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter innerLevelContext( + {messageFunction, &innerLevelTroubleshootingAid}); + + verifyVeloxException( + [&]() { VELOX_CHECK_EQ(1, 3); }, + "Exception: VeloxRuntimeError" + "\nError Source: RUNTIME" + "\nError Code: INVALID_STATE" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: System error: Inner-level troubleshooting aid." + "\nAdditional Context: System error: Mid-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(2, callCount); + + verifyVeloxException( + [&]() { VELOX_USER_CHECK_EQ(1, 3); }, + "Exception: VeloxUserError" + "\nError Source: USER" + "\nError Code: INVALID_ARGUMENT" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: User error: Inner-level troubleshooting aid." + "\nAdditional Context: User error: Mid-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(4, callCount); + } + + { + callCount = 0; + // Create multi-layer contexts with none marked as essential. 
+ MessageFunctionArg topLevelTroubleshootingAid{ + "Top-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, .arg = &topLevelTroubleshootingAid}); + + MessageFunctionArg midLevelTroubleshootingAid{ + "Mid-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter midLevelContext( + {.messageFunc = messageFunction, .arg = &midLevelTroubleshootingAid}); + + MessageFunctionArg innerLevelTroubleshootingAid{ + "Inner-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter innerLevelContext( + {messageFunction, &innerLevelTroubleshootingAid}); + + verifyVeloxException( + [&]() { VELOX_CHECK_EQ(1, 3); }, + "Exception: VeloxRuntimeError" + "\nError Source: RUNTIME" + "\nError Code: INVALID_STATE" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: System error: Inner-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(1, callCount); + + verifyVeloxException( + [&]() { VELOX_USER_CHECK_EQ(1, 3); }, + "Exception: VeloxUserError" + "\nError Source: USER" + "\nError Code: INVALID_ARGUMENT" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: User error: Inner-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(2, callCount); + } + + { + callCount = 0; + // Create multi-layer contexts with all ancestors marked as essential. + MessageFunctionArg topLevelTroubleshootingAid{ + "Top-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, + .arg = &topLevelTroubleshootingAid, + .isEssential = true}); + + MessageFunctionArg midLevelTroubleshootingAid{ + "Mid-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter midLevelContext( + {.messageFunc = messageFunction, + .arg = &midLevelTroubleshootingAid, + .isEssential = true}); + + MessageFunctionArg innerLevelTroubleshootingAid{ + "Inner-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter innerLevelContext( + {messageFunction, &innerLevelTroubleshootingAid}); + + verifyVeloxException( + [&]() { VELOX_CHECK_EQ(1, 3); }, + "Exception: VeloxRuntimeError" + "\nError Source: RUNTIME" + "\nError Code: INVALID_STATE" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: System error: Inner-level troubleshooting aid." + "\nAdditional Context: System error: Mid-level troubleshooting aid. System error: Top-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(3, callCount); + + verifyVeloxException( + [&]() { VELOX_USER_CHECK_EQ(1, 3); }, + "Exception: VeloxUserError" + "\nError Source: USER" + "\nError Code: INVALID_ARGUMENT" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: User error: Inner-level troubleshooting aid." + "\nAdditional Context: User error: Mid-level troubleshooting aid. User error: Top-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(6, callCount); + } + // Different context. { callCount = 0; @@ -649,7 +802,6 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: System error: Debugging info." - "\nTop-Level Context: Same as context." 
"\nFunction: operator()" "\nFile: "); @@ -664,7 +816,6 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: User error: Debugging info." - "\nTop-Level Context: Same as context." "\nFunction: operator()" "\nFile: "); @@ -709,7 +860,6 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: Failed to produce additional context." - "\nTop-Level Context: Same as context." "\nFunction: operator()" "\nFile: "); @@ -743,7 +893,7 @@ TEST(ExceptionTest, wrappedException) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_TRUE(ve.isUserError()); ASSERT_EQ(ve.context(), ""); - ASSERT_EQ(ve.topLevelContext(), ""); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -755,7 +905,7 @@ TEST(ExceptionTest, wrappedException) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_FALSE(ve.isUserError()); ASSERT_EQ(ve.context(), ""); - ASSERT_EQ(ve.topLevelContext(), ""); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -784,7 +934,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { std::string data = "lakes"; facebook::velox::ExceptionContextSetter context( - {messageFunction, data.data()}); + {messageFunction, data.data(), true}); try { throw std::invalid_argument("This is a test."); @@ -793,7 +943,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_TRUE(ve.isUserError()); ASSERT_EQ(ve.context(), "User error: lakes"); - ASSERT_EQ(ve.topLevelContext(), "Same as context."); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -805,7 +955,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_FALSE(ve.isUserError()); ASSERT_EQ(ve.context(), "System error: lakes"); - ASSERT_EQ(ve.topLevelContext(), "Same as context."); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -821,7 +971,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_TRUE(ve.isUserError()); ASSERT_EQ(ve.context(), "User error: mountains"); - ASSERT_EQ(ve.topLevelContext(), "User error: lakes"); + ASSERT_EQ(ve.additionalContext(), "User error: lakes"); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -833,7 +983,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_FALSE(ve.isUserError()); ASSERT_EQ(ve.context(), "System error: mountains"); - ASSERT_EQ(ve.topLevelContext(), "System error: lakes"); + ASSERT_EQ(ve.additionalContext(), "System error: lakes"); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } diff --git a/velox/common/base/tests/StatsReporterTest.cpp b/velox/common/base/tests/StatsReporterTest.cpp index b405f72513839..fd7e89a6f35fe 100644 --- a/velox/common/base/tests/StatsReporterTest.cpp +++ b/velox/common/base/tests/StatsReporterTest.cpp @@ -20,15 +20,13 @@ #include #include #include +#include +#include "velox/common/base/Counters.h" +#include "velox/common/base/PeriodicStatsReporter.h" +#include "velox/common/base/tests/GTestUtils.h" namespace facebook::velox { -class StatsReporterTest : public testing::Test { - protected: - void SetUp() override {} - void 
TearDown() override {}
-};
-
 class TestReporter : public BaseStatsReporter {
  public:
  mutable std::unordered_map<std::string, size_t> counterMap;
@@ -36,6 +34,12 @@ class TestReporter : public BaseStatsReporter {
  mutable std::unordered_map<std::string, StatType> statTypeMap;
  mutable std::unordered_map<std::string, std::vector<int32_t>>
      histogramPercentilesMap;
 
+  void clear() {
+    counterMap.clear();
+    statTypeMap.clear();
+    histogramPercentilesMap.clear();
+  }
+
   void registerMetricExportType(const char* key, StatType statType)
       const override {
     statTypeMap[key] = statType;
@@ -92,22 +96,32 @@
   }
 };
 
-TEST_F(StatsReporterTest, trivialReporter) {
-  auto reporter = std::dynamic_pointer_cast<TestReporter>(
-      folly::Singleton<BaseStatsReporter>::try_get());
+class StatsReporterTest : public testing::Test {
+ protected:
+  void SetUp() override {
+    reporter_ = std::dynamic_pointer_cast<TestReporter>(
+        folly::Singleton<BaseStatsReporter>::try_get());
+  }
 
+  void TearDown() override {
+    reporter_->clear();
+  }
+
+  std::shared_ptr<TestReporter> reporter_;
+};
 
+TEST_F(StatsReporterTest, trivialReporter) {
   DEFINE_METRIC("key1", StatType::COUNT);
   DEFINE_METRIC("key2", StatType::SUM);
   DEFINE_METRIC("key3", StatType::RATE);
   DEFINE_HISTOGRAM_METRIC("key4", 10, 0, 100, 50, 99, 100);
 
-  EXPECT_EQ(StatType::COUNT, reporter->statTypeMap["key1"]);
-  EXPECT_EQ(StatType::SUM, reporter->statTypeMap["key2"]);
-  EXPECT_EQ(StatType::RATE, reporter->statTypeMap["key3"]);
+  EXPECT_EQ(StatType::COUNT, reporter_->statTypeMap["key1"]);
+  EXPECT_EQ(StatType::SUM, reporter_->statTypeMap["key2"]);
+  EXPECT_EQ(StatType::RATE, reporter_->statTypeMap["key3"]);
   std::vector<int32_t> expected = {50, 99, 100};
-  EXPECT_EQ(expected, reporter->histogramPercentilesMap["key4"]);
+  EXPECT_EQ(expected, reporter_->histogramPercentilesMap["key4"]);
   EXPECT_TRUE(
-      reporter->statTypeMap.find("key5") == reporter->statTypeMap.end());
+      reporter_->statTypeMap.find("key5") == reporter_->statTypeMap.end());
 
   RECORD_METRIC_VALUE("key1", 10);
   RECORD_METRIC_VALUE("key1", 11);
@@ -119,12 +133,101 @@
   RECORD_HISTOGRAM_METRIC_VALUE("key4", 50);
   RECORD_HISTOGRAM_METRIC_VALUE("key4", 100);
 
-  EXPECT_EQ(36, reporter->counterMap["key1"]);
-  EXPECT_EQ(2201, reporter->counterMap["key2"]);
-  EXPECT_EQ(1101, reporter->counterMap["key3"]);
-  EXPECT_EQ(100, reporter->counterMap["key4"]);
+  EXPECT_EQ(36, reporter_->counterMap["key1"]);
+  EXPECT_EQ(2201, reporter_->counterMap["key2"]);
+  EXPECT_EQ(1101, reporter_->counterMap["key3"]);
+  EXPECT_EQ(100, reporter_->counterMap["key4"]);
+};
+
+class PeriodicStatsReporterTest : public StatsReporterTest {};
+
+class TestStatsReportMemoryArbitrator : public memory::MemoryArbitrator {
+ public:
+  explicit TestStatsReportMemoryArbitrator(
+      memory::MemoryArbitrator::Stats stats)
+      : memory::MemoryArbitrator({}), stats_(stats) {}
+
+  ~TestStatsReportMemoryArbitrator() override = default;
+
+  void updateStats(memory::MemoryArbitrator::Stats stats) {
+    stats_ = stats;
+  }
+
+  std::string kind() const override {
+    return "test";
+  }
+
+  uint64_t growCapacity(memory::MemoryPool* /*unused*/, uint64_t /*unused*/)
+      override {
+    return 0;
+  }
+
+  bool growCapacity(
+      memory::MemoryPool* /*unused*/,
+      const std::vector<std::shared_ptr<memory::MemoryPool>>& /*unused*/,
+      uint64_t /*unused*/) override {
+    return false;
+  }
+
+  uint64_t shrinkCapacity(memory::MemoryPool* /*unused*/, uint64_t /*unused*/)
+      override {
+    return 0;
+  }
+
+  uint64_t shrinkCapacity(
+      const std::vector<std::shared_ptr<memory::MemoryPool>>& /*unused*/,
+      uint64_t /*unused*/,
+      bool /*unused*/,
+      bool /*unused*/) override {
+    return 0;
+  }
+
+  Stats stats() const override {
+    return stats_;
+  }
+
+  std::string toString() const override {
+    return
"TestStatsReportMemoryArbitrator::toString()"; + } + + private: + memory::MemoryArbitrator::Stats stats_; }; +TEST_F(PeriodicStatsReporterTest, basic) { + TestStatsReportMemoryArbitrator arbitrator({}); + PeriodicStatsReporter::Options options; + options.arbitrator = &arbitrator; + options.arbitratorStatsIntervalMs = 4'000; + PeriodicStatsReporter periodicReporter(options); + + periodicReporter.start(); + std::this_thread::sleep_for(std::chrono::milliseconds(2'000)); + // Stop right after sufficient wait to ensure the following reads from main + // thread does not trigger TSAN failures. + periodicReporter.stop(); + + const auto& counterMap = reporter_->counterMap; + ASSERT_EQ(counterMap.size(), 2); + ASSERT_EQ(counterMap.count(kMetricArbitratorFreeCapacityBytes.str()), 1); + ASSERT_EQ( + counterMap.count(kMetricArbitratorFreeReservedCapacityBytes.str()), 1); +} + +TEST_F(PeriodicStatsReporterTest, globalInstance) { + TestStatsReportMemoryArbitrator arbitrator({}); + PeriodicStatsReporter::Options options; + options.arbitrator = &arbitrator; + options.arbitratorStatsIntervalMs = 4'000; + VELOX_ASSERT_THROW( + stopPeriodicStatsReporter(), "No periodic stats reporter to stop."); + ASSERT_NO_THROW(startPeriodicStatsReporter(options)); + VELOX_ASSERT_THROW( + startPeriodicStatsReporter(options), + "The periodic stats reporter has already started."); + ASSERT_NO_THROW(stopPeriodicStatsReporter()); +} + // Registering to folly Singleton with intended reporter type folly::Singleton reporter([]() { return new TestReporter(); diff --git a/velox/common/caching/AsyncDataCache.h b/velox/common/caching/AsyncDataCache.h index dd09e560543fa..b77a05952c8d6 100644 --- a/velox/common/caching/AsyncDataCache.h +++ b/velox/common/caching/AsyncDataCache.h @@ -716,7 +716,7 @@ class AsyncDataCache : public memory::Cache { /// Returns snapshot of the aggregated stats from all shards and the stats of /// SSD cache if used. - CacheStats refreshStats() const; + virtual CacheStats refreshStats() const; /// If 'details' is true, returns the stats of the backing memory allocator /// and ssd cache. Otherwise, only returns the cache stats. 
diff --git a/velox/common/caching/SsdCache.cpp b/velox/common/caching/SsdCache.cpp
index 5b11fe89eebfc..d48eea9ae254a 100644
--- a/velox/common/caching/SsdCache.cpp
+++ b/velox/common/caching/SsdCache.cpp
@@ -60,7 +60,8 @@ SsdCache::SsdCache(
         i,
         fileMaxRegions,
         checkpointIntervalBytes / numShards,
-        disableFileCow));
+        disableFileCow,
+        executor_));
   }
 }
diff --git a/velox/common/caching/SsdCache.h b/velox/common/caching/SsdCache.h
index 2370bf00d68d4..20c5d6e0b87cf 100644
--- a/velox/common/caching/SsdCache.h
+++ b/velox/common/caching/SsdCache.h
@@ -105,6 +105,10 @@ class SsdCache {
 
   std::string toString() const;
 
+  const std::string& filePrefix() const {
+    return filePrefix_;
+  }
+
  private:
   const std::string filePrefix_;
   const int32_t numShards_;
diff --git a/velox/common/caching/SsdFile.cpp b/velox/common/caching/SsdFile.cpp
index 4c9483de2564f..e977f610ebe3a 100644
--- a/velox/common/caching/SsdFile.cpp
+++ b/velox/common/caching/SsdFile.cpp
@@ -135,6 +135,7 @@ SsdFile::SsdFile(
     folly::Executor* executor)
     : fileName_(filename),
       maxRegions_(maxRegions),
+      disableFileCow_(disableFileCow),
       shardId_(shardId),
       checkpointIntervalBytes_(checkpointIntervalBytes),
       executor_(executor) {
@@ -155,7 +156,7 @@ SsdFile::SsdFile(
       filename,
       folly::errnoStr(errno));
 
-  if (disableFileCow) {
+  if (disableFileCow_) {
     disableCow(fd_);
   }
 
@@ -346,6 +347,7 @@ bool SsdFile::growOrEvictLocked() {
 
   logEviction(candidates);
   clearRegionEntriesLocked(candidates);
+  stats_.regionsEvicted += candidates.size();
   writableRegions_ = std::move(candidates);
   suspended_ = false;
   return true;
@@ -531,6 +533,7 @@ void SsdFile::updateStats(SsdCacheStats& stats) const {
   }
   stats.entriesAgedOut += stats_.entriesAgedOut;
   stats.regionsAgedOut += stats_.regionsAgedOut;
+  stats.regionsEvicted += stats_.regionsEvicted;
   for (auto pins : regionPins_) {
     stats.numPins += pins;
   }
@@ -711,15 +714,6 @@ void SsdFile::checkpoint(bool force) {
   checkpointDeleted_ = false;
   bytesAfterCheckpoint_ = 0;
   try {
-    // We schedule the potentially long fsync of the cache file on another
-    // thread of the cache write executor, if available. If there is none, we do
-    // the sync on this thread at the end.
-    auto fileSync = std::make_shared<AsyncSource<int>>(
-        [fd = fd_]() { return std::make_unique<int>(::fsync(fd)); });
-    if (executor_ != nullptr) {
-      executor_->add([fileSync]() { fileSync->prepare(); });
-    }
-
     const auto checkRc = [&](int32_t rc, const std::string& errMsg) {
       if (rc < 0) {
         VELOX_FAIL("{} with rc {} :{}", errMsg, rc, folly::errnoStr(errno));
@@ -769,6 +763,15 @@ void SsdFile::checkpoint(bool force) {
       state.write(asChar(&offsetAndSize), sizeof(offsetAndSize));
     }
 
+    // We schedule the potentially long fsync of the cache file on another
+    // thread of the cache write executor, if available. If there is none, we do
+    // the sync on this thread at the end.
+    auto fileSync = std::make_shared<AsyncSource<int>>(
+        [fd = fd_]() { return std::make_unique<int>(::fsync(fd)); });
+    if (executor_ != nullptr) {
+      executor_->add([fileSync]() { fileSync->prepare(); });
+    }
+
     // NOTE: we need to ensure cache file data sync update completes before
     // updating checkpoint file.
     const auto fileSyncRc = fileSync->move();
@@ -790,6 +793,11 @@ void SsdFile::checkpoint(bool force) {
     const auto checkpointFd = checkRc(
         ::open(checkpointPath.c_str(), O_WRONLY),
         "Open of checkpoint file for sync");
+    // TODO: add this as file open option after we migrate to use velox
+    // filesystem for ssd file access.
+ if (disableFileCow_) { + disableCow(checkpointFd); + } VELOX_CHECK_GE(checkpointFd, 0); checkRc(::fsync(checkpointFd), "Sync of checkpoint file"); ::close(checkpointFd); @@ -822,6 +830,9 @@ void SsdFile::initializeCheckpoint() { } const auto logPath = fileName_ + kLogExtension; evictLogFd_ = ::open(logPath.c_str(), O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + if (disableFileCow_) { + disableCow(evictLogFd_); + } if (evictLogFd_ < 0) { ++stats_.openLogErrors; // Failure to open the log at startup is a process terminating error. diff --git a/velox/common/caching/SsdFile.h b/velox/common/caching/SsdFile.h index 77f03534a6a14..9b027cf9cb604 100644 --- a/velox/common/caching/SsdFile.h +++ b/velox/common/caching/SsdFile.h @@ -138,6 +138,7 @@ struct SsdCacheStats { bytesCached = tsanAtomicValue(other.bytesCached); entriesAgedOut = tsanAtomicValue(other.entriesAgedOut); regionsAgedOut = tsanAtomicValue(other.regionsAgedOut); + regionsEvicted = tsanAtomicValue(other.regionsEvicted); numPins = tsanAtomicValue(other.numPins); openFileErrors = tsanAtomicValue(other.openFileErrors); @@ -162,6 +163,7 @@ struct SsdCacheStats { tsan_atomic bytesCached{0}; tsan_atomic entriesAgedOut{0}; tsan_atomic regionsAgedOut{0}; + tsan_atomic regionsEvicted{0}; tsan_atomic numPins{0}; tsan_atomic openFileErrors{0}; @@ -272,6 +274,11 @@ class SsdFile { /// Returns true if copy on write is disabled for this file. Used in testing. bool testingIsCowDisabled() const; + /// Return the SSD file path. + const std::string& fileName() const { + return fileName_; + } + private: // 4 first bytes of a checkpoint file. Allows distinguishing between format // versions. @@ -350,6 +357,9 @@ class SsdFile { // Maximum size of the backing file in kRegionSize units. const int32_t maxRegions_; + // True if copy on write should be disabled. + const bool disableFileCow_; + // Serializes access to all private data members. mutable std::shared_mutex mutex_; diff --git a/velox/common/encode/Base64.cpp b/velox/common/encode/Base64.cpp index 85fd843b86a83..4135935189baf 100644 --- a/velox/common/encode/Base64.cpp +++ b/velox/common/encode/Base64.cpp @@ -22,6 +22,13 @@ namespace facebook::velox::encoding { +// Constants defining the size in bytes of binary and encoded blocks for Base64 +// encoding. 
+// Size of a binary block in bytes (3 bytes = 24 bits)
+constexpr static int kBinaryBlockByteSize = 3;
+// Size of an encoded block in bytes (4 bytes = 24 bits)
+constexpr static int kEncodedBlockByteSize = 4;
+
 constexpr const Base64::Charset kBase64Charset = {
     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
     'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
@@ -298,10 +305,9 @@ std::string Base64::decode(folly::StringPiece encoded) {
 void Base64::decode(
     const std::pair<const char*, int32_t>& payload,
     std::string& output) {
-  size_t out_len = payload.second / 4 * 3;
-  output.resize(out_len, '\0');
-  out_len = Base64::decode(payload.first, payload.second, &output[0], out_len);
-  output.resize(out_len);
+  size_t inputSize = payload.second;
+  output.resize(calculateDecodedSize(payload.first, inputSize));
+  decode(payload.first, inputSize, output.data(), output.size());
 }
 
 // static
@@ -324,51 +330,50 @@ uint8_t Base64::Base64ReverseLookup(
 size_t
 Base64::decode(const char* src, size_t src_len, char* dst, size_t dst_len) {
-  return decodeImpl(src, src_len, dst, dst_len, kBase64ReverseIndexTable, true);
+  return decodeImpl(src, src_len, dst, dst_len, kBase64ReverseIndexTable);
 }
 
 // static
-size_t
-Base64::calculateDecodedSize(const char* data, size_t& size, bool withPadding) {
+size_t Base64::calculateDecodedSize(const char* data, size_t& size) {
   if (size == 0) {
     return 0;
   }
 
-  auto needed = (size / 4) * 3;
-  if (withPadding) {
-    // If the pad characters are included then the source string must be a
-    // multiple of 4 and we can query the end of the string to see how much
-    // padding exists.
-    if (size % 4 != 0) {
+  // Check if the input data is padded
+  if (isPadded(data, size)) {
+    // If padded, ensure that the string length is a multiple of the encoded
+    // block size
+    if (size % kEncodedBlockByteSize != 0) {
       throw Base64Exception(
           "Base64::decode() - invalid input string: "
-          "string length is not multiple of 4.");
+          "string length is not a multiple of 4.");
     }
 
+    auto needed = (size * kBinaryBlockByteSize) / kEncodedBlockByteSize;
     auto padding = countPadding(data, size);
     size -= padding;
-    return needed - padding;
+
+    // Adjust the needed size by deducting the bytes corresponding to the
+    // padding from the calculated size.
+    return needed -
+        ((padding * kBinaryBlockByteSize) + (kEncodedBlockByteSize - 1)) /
+        kEncodedBlockByteSize;
   }
+  // If not padded, calculate extra bytes, if any
+  auto extra = size % kEncodedBlockByteSize;
+  auto needed = (size / kEncodedBlockByteSize) * kBinaryBlockByteSize;
 
-  // If padding doesn't exist we need to calculate it from the size - if the
-  // size % 4 is 0 then we have an even multiple 3 byte chunks in the result
-  // if it is 2 then we need 1 more byte in the output. If it is 3 then we
-  // need 2 more bytes in the output. It should never be 1.
-  auto extra = size % 4;
+  // Adjust the needed size for extra bytes, if present
   if (extra) {
     if (extra == 1) {
       throw Base64Exception(
          "Base64::decode() - invalid input string: "
          "string length cannot be 1 more than a multiple of 4.");
    }
-    return needed + extra - 1;
+    needed += (extra * kBinaryBlockByteSize) / kEncodedBlockByteSize;
  }
 
-  // Just because we don't need the pad, doesn't mean it is not there. The
-  // URL decoder should be able to handle the original encoding.
-  auto padding = countPadding(data, size);
-  size -= padding;
-  return needed - padding;
+  return needed;
 }
 
 size_t Base64::decodeImpl(
@@ -376,13 +381,12 @@
     size_t src_len,
     char* dst,
     size_t dst_len,
-    const Base64::ReverseIndex& reverse_lookup,
-    bool include_pad) {
+    const ReverseIndex& reverse_lookup) {
   if (!src_len) {
     return 0;
   }
 
-  auto needed = calculateDecodedSize(src, src_len, include_pad);
+  auto needed = calculateDecodedSize(src, src_len);
   if (dst_len < needed) {
     throw Base64Exception(
         "Base64::decode() - invalid output string: "
@@ -437,9 +441,8 @@ void Base64::decodeUrl(
     const char* src,
     size_t src_len,
     char* dst,
-    size_t dst_len,
-    bool hasPad) {
-  decodeImpl(src, src_len, dst, dst_len, kBase64UrlReverseIndexTable, hasPad);
+    size_t dst_len) {
+  decodeImpl(src, src_len, dst, dst_len, kBase64UrlReverseIndexTable);
 }
 
 std::string Base64::decodeUrl(folly::StringPiece encoded) {
@@ -458,8 +461,7 @@ void Base64::decodeUrl(
       payload.second,
       &output[0],
       out_len,
-      kBase64UrlReverseIndexTable,
-      false);
+      kBase64UrlReverseIndexTable);
   output.resize(out_len);
 }
 } // namespace facebook::velox::encoding
diff --git a/velox/common/encode/Base64.h b/velox/common/encode/Base64.h
index 9888d97e67c54..2c7de463ea6fa 100644
--- a/velox/common/encode/Base64.h
+++ b/velox/common/encode/Base64.h
@@ -57,10 +57,9 @@ class Base64 {
 
   static std::string decode(folly::StringPiece encoded);
 
-  /// Returns decoded size for the specified input. Adjusts the 'size' to
-  /// subtract the length of the padding, if exists.
-  static size_t
-  calculateDecodedSize(const char* data, size_t& size, bool withPadding = true);
+  /// Returns the actual size of the decoded data. Also subtracts the length of
+  /// the padding, if any, from the input data 'size'.
+  static size_t calculateDecodedSize(const char* data, size_t& size);
 
   /// Decodes the specified number of characters from the 'data' and writes the
   /// result to the 'output'. The output must have enough space, e.g. as
@@ -69,7 +68,7 @@
 
   static void decode(
       const std::pair<const char*, int32_t>& payload,
-      std::string& outp);
+      std::string& output);
 
   /// Encodes the specified number of characters from the 'data' and writes the
   /// result to the 'output'. The output must have enough space, e.g. as
@@ -89,19 +88,24 @@
 
   static size_t
   decode(const char* src, size_t src_len, char* dst, size_t dst_len);
 
-  static void decodeUrl(
-      const char* src,
-      size_t src_len,
-      char* dst,
-      size_t dst_len,
-      bool pad);
+  static void
+  decodeUrl(const char* src, size_t src_len, char* dst, size_t dst_len);
 
   constexpr static char kBase64Pad = '=';
 
  private:
+  static inline bool isPadded(const char* data, size_t len) {
+    return (len > 0 && data[len - 1] == kBase64Pad);
+  }
+
   static inline size_t countPadding(const char* src, size_t len) {
-    DCHECK_GE(len, 2);
-    return src[len - 1] != kBase64Pad ? 0 : src[len - 2] != kBase64Pad ?
1 : 2;
+    size_t numPadding{0};
+    while (len > 0 && src[len - 1] == kBase64Pad) {
+      numPadding++;
+      len--;
+    }
+
+    return numPadding;
   }
 
   static uint8_t Base64ReverseLookup(char p, const ReverseIndex& table);
@@ -122,8 +126,7 @@
       size_t src_len,
       char* dst,
      size_t dst_len,
-      const ReverseIndex& table,
-      bool include_pad);
+      const ReverseIndex& table);
 };
 
 } // namespace facebook::velox::encoding
diff --git a/velox/common/encode/CMakeLists.txt b/velox/common/encode/CMakeLists.txt
index d9918d53b59c5..bc27527e14ace 100644
--- a/velox/common/encode/CMakeLists.txt
+++ b/velox/common/encode/CMakeLists.txt
@@ -12,5 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+if(${VELOX_BUILD_TESTING})
+  add_subdirectory(tests)
+endif()
+
 add_library(velox_encode Base64.cpp)
 target_link_libraries(velox_encode PUBLIC Folly::folly)
diff --git a/velox/common/encode/tests/Base64Test.cpp b/velox/common/encode/tests/Base64Test.cpp
new file mode 100644
index 0000000000000..15556583c7519
--- /dev/null
+++ b/velox/common/encode/tests/Base64Test.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/common/encode/Base64.h"
+#include <gtest/gtest.h>
+#include "velox/common/base/tests/GTestUtils.h"
+
+namespace facebook::velox::encoding {
+class Base64Test : public ::testing::Test {};
+
+TEST_F(Base64Test, fromBase64) {
+  EXPECT_EQ(
+      "Hello, World!",
+      Base64::decode(folly::StringPiece("SGVsbG8sIFdvcmxkIQ==")));
+  EXPECT_EQ(
+      "Base64 encoding is fun.",
+      Base64::decode(folly::StringPiece("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4=")));
+  EXPECT_EQ(
+      "Simple text", Base64::decode(folly::StringPiece("U2ltcGxlIHRleHQ=")));
+  EXPECT_EQ(
+      "1234567890", Base64::decode(folly::StringPiece("MTIzNDU2Nzg5MA==")));
+
+  // Check encoded strings without padding
+  EXPECT_EQ(
+      "Hello, World!",
+      Base64::decode(folly::StringPiece("SGVsbG8sIFdvcmxkIQ")));
+  EXPECT_EQ(
+      "Base64 encoding is fun.",
+      Base64::decode(folly::StringPiece("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4")));
+  EXPECT_EQ(
+      "Simple text", Base64::decode(folly::StringPiece("U2ltcGxlIHRleHQ")));
+  EXPECT_EQ("1234567890", Base64::decode(folly::StringPiece("MTIzNDU2Nzg5MA")));
+}
+
+TEST_F(Base64Test, calculateDecodedSizeProperSize) {
+  size_t encoded_size{0};
+
+  encoded_size = 20;
+  EXPECT_EQ(
+      13, Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ==", encoded_size));
+  EXPECT_EQ(18, encoded_size);
+
+  encoded_size = 18;
+  EXPECT_EQ(
+      13, Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ", encoded_size));
+  EXPECT_EQ(18, encoded_size);
+
+  encoded_size = 21;
+  EXPECT_THROW(
+      Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ==", encoded_size),
+      facebook::velox::encoding::Base64Exception);
+
+  encoded_size = 32;
+  EXPECT_EQ(
+      23,
+      Base64::calculateDecodedSize(
+          "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4=", encoded_size));
+  EXPECT_EQ(31, encoded_size);
+
+  encoded_size = 31;
+  EXPECT_EQ(
+      23,
+      Base64::calculateDecodedSize(
"QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4", encoded_size)); + EXPECT_EQ(31, encoded_size); + + encoded_size = 16; + EXPECT_EQ(10, Base64::calculateDecodedSize("MTIzNDU2Nzg5MA==", encoded_size)); + EXPECT_EQ(14, encoded_size); + + encoded_size = 14; + EXPECT_EQ(10, Base64::calculateDecodedSize("MTIzNDU2Nzg5MA", encoded_size)); + EXPECT_EQ(14, encoded_size); +} + +} // namespace facebook::velox::encoding diff --git a/velox/expression/tests/utils/CMakeLists.txt b/velox/common/encode/tests/CMakeLists.txt similarity index 70% rename from velox/expression/tests/utils/CMakeLists.txt rename to velox/common/encode/tests/CMakeLists.txt index afdae1b2789d1..e3268cb7f1b96 100644 --- a/velox/expression/tests/utils/CMakeLists.txt +++ b/velox/common/encode/tests/CMakeLists.txt @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_expression_test_utility ArgumentTypeFuzzer.cpp - FuzzerToolkit.cpp) - -target_link_libraries(velox_expression_test_utility velox_type - velox_expression_functions gtest) +add_executable(velox_common_encode_test Base64Test.cpp) +add_test(velox_common_encode_test velox_common_encode_test) +target_link_libraries( + velox_common_encode_test + PUBLIC Folly::folly + PRIVATE velox_encode velox_exception gtest gtest_main) diff --git a/velox/common/memory/MemoryAllocator.cpp b/velox/common/memory/MemoryAllocator.cpp index 3995b18419299..e4dd46457b3a4 100644 --- a/velox/common/memory/MemoryAllocator.cpp +++ b/velox/common/memory/MemoryAllocator.cpp @@ -364,15 +364,18 @@ std::string Stats::toString() const { std::stringstream out; int64_t totalClocks = 0; int64_t totalBytes = 0; + int64_t totalAllocations = 0; for (auto i = 0; i < sizes.size(); ++i) { totalClocks += sizes[i].clocks(); totalBytes += sizes[i].totalBytes; + totalAllocations += sizes[i].numAllocations; } out << fmt::format( - "Alloc: {}MB {} Gigaclocks, {}MB advised\n", + "Alloc: {}MB {} Gigaclocks {} Allocations, {}MB advised\n", totalBytes >> 20, totalClocks >> 30, - numAdvise >> 8); + numAdvise >> 8, + totalAllocations); // Sort the size classes by decreasing clocks. 
 std::vector<int32_t> indices(sizes.size());
@@ -386,10 +389,11 @@
       break;
     }
     out << fmt::format(
-        "Size {}K: {}MB {} Megaclocks\n",
+        "Size {}K: {}MB {} Megaclocks {} Allocations\n",
         sizes[i].size * 4,
         sizes[i].totalBytes >> 20,
-        sizes[i].clocks() >> 20);
+        sizes[i].clocks() >> 20,
+        sizes[i].numAllocations);
   }
   return out.str();
 }
diff --git a/velox/common/memory/SharedArbitrator.cpp b/velox/common/memory/SharedArbitrator.cpp
index 01bd3802f6dea..ffd31daa52ca8 100644
--- a/velox/common/memory/SharedArbitrator.cpp
+++ b/velox/common/memory/SharedArbitrator.cpp
@@ -150,7 +150,6 @@ SharedArbitrator::SharedArbitrator(const MemoryArbitrator::Config& config)
       freeReservedCapacity_(reservedCapacity_),
       freeNonReservedCapacity_(capacity_ - freeReservedCapacity_) {
   VELOX_CHECK_EQ(kind_, config.kind);
-  updateFreeCapacityMetrics();
 }
 
 std::string SharedArbitrator::Candidate::toString() const {
@@ -192,14 +191,6 @@ std::vector<SharedArbitrator::Candidate> SharedArbitrator::getCandidateStats(
   return candidates;
 }
 
-void SharedArbitrator::updateFreeCapacityMetrics() const {
-  RECORD_METRIC_VALUE(
-      kMetricArbitratorFreeCapacityBytes,
-      freeNonReservedCapacity_ + freeReservedCapacity_);
-  RECORD_METRIC_VALUE(
-      kMetricArbitratorFreeReservedCapacityBytes, freeReservedCapacity_);
-}
-
 int64_t SharedArbitrator::maxReclaimableCapacity(const MemoryPool& pool) const {
   return std::max(0, pool.capacity() - memoryPoolReservedCapacity_);
 }
@@ -226,8 +217,6 @@ int64_t SharedArbitrator::minGrowCapacity(const MemoryPool& pool) const {
 uint64_t SharedArbitrator::growCapacity(
     MemoryPool* pool,
     uint64_t targetBytes) {
-  const auto freeCapacityMetricUpdateCb =
-      folly::makeGuard([this]() { updateFreeCapacityMetrics(); });
   uint64_t reservedBytes{0};
   {
     std::lock_guard<std::mutex> l(mutex_);
@@ -276,9 +265,6 @@ uint64_t SharedArbitrator::decrementFreeCapacityLocked(
 uint64_t SharedArbitrator::shrinkCapacity(
     MemoryPool* pool,
     uint64_t targetBytes) {
-  const auto freeCapacityUpdateCb =
-      folly::makeGuard([this]() { updateFreeCapacityMetrics(); });
-
   uint64_t freedBytes{0};
   {
     std::lock_guard<std::mutex> l(mutex_);
@@ -294,9 +280,6 @@ uint64_t SharedArbitrator::shrinkCapacity(
     uint64_t targetBytes,
     bool allowSpill,
     bool allowAbort) {
-  const auto freeCapacityUpdateCb =
-      folly::makeGuard([this]() { updateFreeCapacityMetrics(); });
-
   ScopedArbitration scopedArbitration(this);
   if (targetBytes == 0) {
     targetBytes = capacity_;
@@ -345,9 +328,6 @@ bool SharedArbitrator::growCapacity(
     MemoryPool* pool,
     const std::vector<std::shared_ptr<MemoryPool>>& candidatePools,
     uint64_t targetBytes) {
-  const auto freeCapacityUpdateCb =
-      folly::makeGuard([this]() { updateFreeCapacityMetrics(); });
-
   ScopedArbitration scopedArbitration(pool, this);
   MemoryPool* requestor = pool->root();
   if (requestor->aborted()) {
diff --git a/velox/common/memory/SharedArbitrator.h b/velox/common/memory/SharedArbitrator.h
index 50dc8c015d188..865e44ac269a3 100644
--- a/velox/common/memory/SharedArbitrator.h
+++ b/velox/common/memory/SharedArbitrator.h
@@ -239,11 +239,6 @@ class SharedArbitrator : public memory::MemoryArbitrator {
   // the reserved capacity as specified by 'memoryPoolReservedCapacity_'.
   int64_t minGrowCapacity(const MemoryPool& pool) const;
 
-  // Updates the free capacity metrics on capacity changes.
-  //
-  // TODO: move this update to velox runtime monitoring service once available.
-  void updateFreeCapacityMetrics() const;
-
   mutable std::mutex mutex_;
   tsan_atomic<uint64_t> freeReservedCapacity_{0};
   tsan_atomic<uint64_t> freeNonReservedCapacity_{0};
diff --git a/velox/common/memory/tests/MemoryAllocatorTest.cpp b/velox/common/memory/tests/MemoryAllocatorTest.cpp
index 86133a88e2bf3..4bfff3f4ffada 100644
--- a/velox/common/memory/tests/MemoryAllocatorTest.cpp
+++ b/velox/common/memory/tests/MemoryAllocatorTest.cpp
@@ -632,10 +632,42 @@ TEST_P(MemoryAllocatorTest, allocationClass2) {
   allocation->clear();
 }
 
+TEST_P(MemoryAllocatorTest, stats) {
+  const std::vector<MachinePageCount>& sizes = instance_->sizeClasses();
+  MachinePageCount capacity = kCapacityPages;
+  for (auto i = 0; i < sizes.size(); ++i) {
+    std::unique_ptr<Allocation> allocation = std::make_unique<Allocation>();
+    auto size = sizes[i];
+    ASSERT_TRUE(allocate(size, *allocation));
+    ASSERT_GT(instance_->numAllocated(), 0);
+    instance_->freeNonContiguous(*allocation);
+    auto stats = instance_->stats();
+    ASSERT_EQ(0, stats.sizes[i].clocks());
+    ASSERT_EQ(stats.sizes[i].totalBytes, 0);
+    ASSERT_EQ(stats.sizes[i].numAllocations, 0);
+  }
+
+  gflags::FlagSaver flagSaver;
+  FLAGS_velox_time_allocations = true;
+  for (auto i = 0; i < sizes.size(); ++i) {
+    std::unique_ptr<Allocation> allocation = std::make_unique<Allocation>();
+    auto size = sizes[i];
+    ASSERT_TRUE(allocate(size, *allocation));
+    ASSERT_GT(instance_->numAllocated(), 0);
+    instance_->freeNonContiguous(*allocation);
+    auto stats = instance_->stats();
+    ASSERT_LT(0, stats.sizes[i].clocks());
+    ASSERT_GE(stats.sizes[i].totalBytes, size * AllocationTraits::kPageSize);
+    ASSERT_GE(stats.sizes[i].numAllocations, 1);
+  }
+}
+
 TEST_P(MemoryAllocatorTest, singleAllocation) {
   if (!useMmap_ && enableReservation_) {
     return;
   }
+
+  gflags::FlagSaver flagSaver;
+  FLAGS_velox_time_allocations = true;
   const std::vector<MachinePageCount>& sizes = instance_->sizeClasses();
   MachinePageCount capacity = kCapacityPages;
   for (auto i = 0; i < sizes.size(); ++i) {
diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp
index 2fadb1fd8acdd..d8cfe275fd71a 100644
--- a/velox/connectors/hive/HiveConnectorUtil.cpp
+++ b/velox/connectors/hive/HiveConnectorUtil.cpp
@@ -436,12 +436,16 @@ std::unique_ptr<dwio::common::SerDeOptions> parseSerdeParameters(
   auto mapKeyIt =
       serdeParameters.find(dwio::common::SerDeOptions::kMapKeyDelim);
 
+  auto escapeCharIt =
+      serdeParameters.find(dwio::common::SerDeOptions::kEscapeChar);
+
   auto nullStringIt = tableParameters.find(
       dwio::common::TableParameter::kSerializationNullFormat);
 
   if (fieldIt == serdeParameters.end() &&
       collectionIt == serdeParameters.end() &&
       mapKeyIt == serdeParameters.end() &&
+      escapeCharIt == serdeParameters.end() &&
       nullStringIt == tableParameters.end()) {
     return nullptr;
   }
@@ -458,8 +462,19 @@
   if (mapKeyIt != serdeParameters.end()) {
     mapKeyDelim = parseDelimiter(mapKeyIt->second);
   }
-  auto serDeOptions = std::make_unique<dwio::common::SerDeOptions>(
-      fieldDelim, collectionDelim, mapKeyDelim);
+
+  uint8_t escapeChar;
+  bool hasEscapeChar = false;
+  if (escapeCharIt != serdeParameters.end() && !escapeCharIt->second.empty()) {
+    hasEscapeChar = true;
+    escapeChar = escapeCharIt->second[0];
+  }
+
+  auto serDeOptions = hasEscapeChar
+      ?
std::make_unique<dwio::common::SerDeOptions>(
+          fieldDelim, collectionDelim, mapKeyDelim, escapeChar, true)
+      : std::make_unique<dwio::common::SerDeOptions>(
+            fieldDelim, collectionDelim, mapKeyDelim);
   if (nullStringIt != tableParameters.end()) {
     serDeOptions->nullString = nullStringIt->second;
   }
@@ -553,7 +568,10 @@ void configureRowReaderOptions(
   } else {
     cs = std::make_shared<dwio::common::ColumnSelector>(rowType, columnNames);
   }
-  rowReaderOptions.select(cs).range(hiveSplit->start, hiveSplit->length);
+  rowReaderOptions.select(cs);
+  if (hiveSplit) {
+    rowReaderOptions.range(hiveSplit->start, hiveSplit->length);
+  }
 }

 namespace {
diff --git a/velox/core/Config.cpp b/velox/core/Config.cpp
index 8ccd7a8ccbd28..4465bca74f8ed 100644
--- a/velox/core/Config.cpp
+++ b/velox/core/Config.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 #include "velox/core/Config.h"
+#include "velox/core/QueryConfig.h"
+#include "velox/type/tz/TimeZoneMap.h"

 namespace facebook::velox::core {

@@ -46,4 +48,11 @@ bool MemConfigMutable::isValueExists(const std::string& key) const {
   return lockedValues->find(key) != lockedValues->end();
 }

+void MemConfig::validateConfig() {
+  // Validate that the timezone name can be recognized.
+  if (isValueExists(QueryConfig::kSessionTimezone)) {
+    util::getTimeZoneID(values_[QueryConfig::kSessionTimezone]);
+  }
+}
+
 } // namespace facebook::velox::core
diff --git a/velox/core/Config.h b/velox/core/Config.h
index 11ccea060588d..2dc705e937363 100644
--- a/velox/core/Config.h
+++ b/velox/core/Config.h
@@ -70,12 +70,16 @@ namespace core {
 class MemConfig : public Config {
  public:
   explicit MemConfig(const std::unordered_map<std::string, std::string>& values)
-      : values_(values) {}
+      : values_(values) {
+    validateConfig();
+  }

   explicit MemConfig() : values_{} {}

   explicit MemConfig(std::unordered_map<std::string, std::string>&& values)
-      : values_(std::move(values)) {}
+      : values_(std::move(values)) {
+    validateConfig();
+  }

   folly::Optional<std::string> get(const std::string& key) const override;

@@ -90,6 +94,9 @@ class MemConfig : public Config {
   }

  private:
+  // Validates that the configuration values are valid.
+  void validateConfig();
+
   std::unordered_map<std::string, std::string> values_;
 };

diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h
index c14a596322566..dfba399d967e7 100644
--- a/velox/core/PlanNode.h
+++ b/velox/core/PlanNode.h
@@ -158,7 +158,7 @@ class PlanNode : public ISerializable {
   /// 'addContext' is not null.
   ///
   /// @param addContext Optional lambda to add context for a given plan node.
-  /// Receives plan node ID, indentation and std::stringstring where to append
+  /// Receives plan node ID, indentation and std::stringstream where to append
   /// the context. Use indentation for second and subsequent lines of a
   /// multi-line context. Do not use indentation for single-line context. Do not
   /// add trailing new-line character for the last or only line of context.
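For illustration, the eager validation added to MemConfig above means an invalid session timezone now fails when the config object is constructed rather than later during query execution. A minimal sketch of the resulting behavior (the function and the bogus timezone name are illustrative, not part of the patch):

    #include "velox/common/base/Exceptions.h"
    #include "velox/core/Config.h"
    #include "velox/core/QueryConfig.h"

    using namespace facebook::velox;

    void sessionTimezoneValidationSketch() {
      // A recognized timezone passes validateConfig() inside the constructor.
      core::MemConfig ok({{core::QueryConfig::kSessionTimezone, "UTC"}});

      // An unknown name makes util::getTimeZoneID() throw a Velox user error
      // ("Unknown time zone: ...") before the config can be used anywhere.
      try {
        core::MemConfig bad({{core::QueryConfig::kSessionTimezone, "Not/AZone"}});
      } catch (const VeloxUserError&) {
        // Construction failed eagerly; the invalid value never reaches a query.
      }
    }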
diff --git a/velox/core/SimpleFunctionMetadata.h b/velox/core/SimpleFunctionMetadata.h index ccf80e43c3c3b..9fcb0ad60ac3e 100644 --- a/velox/core/SimpleFunctionMetadata.h +++ b/velox/core/SimpleFunctionMetadata.h @@ -20,6 +20,7 @@ #include #include "velox/common/base/Exceptions.h" +#include "velox/common/base/Status.h" #include "velox/core/CoreTypeSystem.h" #include "velox/core/Metaprogramming.h" #include "velox/core/QueryConfig.h" @@ -687,17 +688,33 @@ class UDFHolder { bool, exec_return_type, const exec_arg_type&...>::value; + static constexpr bool udf_has_call_return_void = util::has_method< Fun, call_method_resolver, void, exec_return_type, const exec_arg_type&...>::value; - static constexpr bool udf_has_call = - udf_has_call_return_bool | udf_has_call_return_void; + + static constexpr bool udf_has_call_return_status = util::has_method< + Fun, + call_method_resolver, + Status, + exec_return_type, + const exec_arg_type&...>::value; + + static constexpr bool udf_has_call = udf_has_call_return_bool | + udf_has_call_return_void | udf_has_call_return_status; + static_assert( !(udf_has_call_return_bool && udf_has_call_return_void), - "Provided call() methods need to return either void OR bool."); + "Provided call() methods need to return either void OR bool OR status."); + static_assert( + !(udf_has_call_return_bool && udf_has_call_return_status), + "Provided call() methods need to return either void OR bool OR status."); + static_assert( + !(udf_has_call_return_void && udf_has_call_return_status), + "Provided call() methods need to return either void OR bool OR status."); // callNullable(): static constexpr bool udf_has_callNullable_return_bool = util::has_method< @@ -863,13 +880,14 @@ class UDFHolder { } } - FOLLY_ALWAYS_INLINE bool call( + FOLLY_ALWAYS_INLINE Status call( exec_return_type& out, + bool& notNull, const typename exec_resolver::in_type&... args) { if constexpr (udf_has_call) { - return callImpl(out, args...); + return callImpl(out, notNull, args...); } else if constexpr (udf_has_callNullable) { - return callNullableImpl(out, (&args)...); + return callNullableImpl(out, notNull, (&args)...); } else { VELOX_UNREACHABLE( "call should never be called if the UDF does not " @@ -877,18 +895,20 @@ class UDFHolder { } } - FOLLY_ALWAYS_INLINE bool callNullable( + FOLLY_ALWAYS_INLINE Status callNullable( exec_return_type& out, + bool& notNull, const typename exec_resolver::in_type*... args) { if constexpr (udf_has_callNullable) { - return callNullableImpl(out, args...); + return callNullableImpl(out, notNull, args...); } else if constexpr (udf_has_call) { // Default null behavior. const bool isAllSet = (args && ...); if (LIKELY(isAllSet)) { - return callImpl(out, (*args)...); + return callImpl(out, notNull, (*args)...); } else { - return false; + notNull = false; + return Status::OK(); } } else { VELOX_UNREACHABLE( @@ -897,21 +917,23 @@ class UDFHolder { } } - FOLLY_ALWAYS_INLINE bool callAscii( + FOLLY_ALWAYS_INLINE Status callAscii( exec_return_type& out, + bool& notNull, const typename exec_resolver::in_type&... args) { if constexpr (udf_has_callAscii) { - return callAsciiImpl(out, args...); + return callAsciiImpl(out, notNull, args...); } else { - return call(out, args...); + return call(out, notNull, args...); } } - FOLLY_ALWAYS_INLINE bool callNullFree( + FOLLY_ALWAYS_INLINE Status callNullFree( exec_return_type& out, + bool& notNull, const exec_no_nulls_arg_type&... 
args) { if constexpr (udf_has_callNullFree) { - return callNullFreeImpl(out, args...); + return callNullFreeImpl(out, notNull, args...); } else { VELOX_UNREACHABLE( "callNullFree should never be called if the UDF does not implement callNullFree."); @@ -920,52 +942,66 @@ class UDFHolder { // Helper functions to handle void vs bool return type. - FOLLY_ALWAYS_INLINE bool callImpl( + FOLLY_ALWAYS_INLINE Status callImpl( typename Exec::template resolver::out_type& out, + bool& notNull, const typename Exec::template resolver::in_type&... args) { static_assert(udf_has_call); - if constexpr (udf_has_call_return_bool) { + + if constexpr (udf_has_call_return_status) { + notNull = true; return instance_.call(out, args...); + } else if constexpr (udf_has_call_return_bool) { + notNull = instance_.call(out, args...); + return Status::OK(); } else { instance_.call(out, args...); - return true; + notNull = true; + return Status::OK(); } } - FOLLY_ALWAYS_INLINE bool callNullableImpl( + FOLLY_ALWAYS_INLINE Status callNullableImpl( exec_return_type& out, + bool& notNull, const typename Exec::template resolver::in_type*... args) { static_assert(udf_has_callNullable); if constexpr (udf_has_callNullable_return_bool) { - return instance_.callNullable(out, args...); + notNull = instance_.callNullable(out, args...); + return Status::OK(); } else { instance_.callNullable(out, args...); - return true; + notNull = true; + return Status::OK(); } } - FOLLY_ALWAYS_INLINE bool callAsciiImpl( + FOLLY_ALWAYS_INLINE Status callAsciiImpl( typename Exec::template resolver::out_type& out, + bool& notNull, const typename Exec::template resolver::in_type&... args) { static_assert(udf_has_callAscii); if constexpr (udf_has_callAscii_return_bool) { - return instance_.callAscii(out, args...); + notNull = instance_.callAscii(out, args...); } else { instance_.callAscii(out, args...); - return true; + notNull = true; } + return Status::OK(); } - FOLLY_ALWAYS_INLINE bool callNullFreeImpl( + FOLLY_ALWAYS_INLINE Status callNullFreeImpl( typename Exec::template resolver::out_type& out, + bool& notNull, const exec_no_nulls_arg_type&... 
args) { static_assert(udf_has_callNullFree); if constexpr (udf_has_callNullFree_return_bool) { - return instance_.callNullFree(out, args...); + notNull = instance_.callNullFree(out, args...); } else { instance_.callNullFree(out, args...); - return true; + notNull = true; } + return Status::OK(); } }; diff --git a/velox/core/tests/QueryConfigTest.cpp b/velox/core/tests/QueryConfigTest.cpp index 67e87fe5cd349..9955bbf34cef1 100644 --- a/velox/core/tests/QueryConfigTest.cpp +++ b/velox/core/tests/QueryConfigTest.cpp @@ -46,8 +46,23 @@ TEST_F(QueryConfigTest, setConfig) { ASSERT_TRUE(config.isLegacyCast()); } +TEST_F(QueryConfigTest, invalidConfig) { + std::unordered_map configData( + {{QueryConfig::kSessionTimezone, "Invalid"}}); + VELOX_ASSERT_USER_THROW( + std::make_shared(nullptr, std::move(configData)), + "Unknown time zone: 'Invalid'"); + + auto queryCtx = std::make_shared(nullptr); + VELOX_ASSERT_USER_THROW( + queryCtx->testingOverrideConfigUnsafe({ + {core::QueryConfig::kSessionTimezone, ""}, + }), + "Unknown time zone: ''"); +} + TEST_F(QueryConfigTest, memConfig) { - const std::string tz = "timezone1"; + const std::string tz = "UTC"; const std::unordered_map configData( {{QueryConfig::kSessionTimezone, tz}}); @@ -72,7 +87,7 @@ TEST_F(QueryConfigTest, memConfig) { tz, cfg.Config::get(QueryConfig::kSessionTimezone).value()); ASSERT_FALSE(cfg.Config::get("missing-entry").has_value()); - const std::string tz2 = "timezone2"; + const std::string tz2 = "PST"; ASSERT_NO_THROW(cfg.setValue(QueryConfig::kSessionTimezone, tz2)); ASSERT_EQ( tz2, diff --git a/velox/docs/develop/testing/fuzzer.rst b/velox/docs/develop/testing/fuzzer.rst index 9aa7630f07a4b..639acda6ee5e5 100644 --- a/velox/docs/develop/testing/fuzzer.rst +++ b/velox/docs/develop/testing/fuzzer.rst @@ -141,6 +141,8 @@ tested: Total aggregations verified against DuckDB: 2537 (44.63%) Total failed aggregations: 1061 (18.67%) +.. _window-fuzzer: + Window Fuzzer ------------- @@ -284,7 +286,7 @@ When Fuzzer test fails, a seed number and the evaluated expression are printed to the log. An example is given below. Developers can use ``--seed`` with this seed number to rerun the exact same expression with the same inputs, and use a debugger to investigate the issue. For the example below, the command -to reproduce the error would be ``velox/expression/tests/velox_expression_fuzzer_test --seed 1188545576``. +to reproduce the error would be ``velox/expression/fuzzer/velox_expression_fuzzer_test --seed 1188545576``. :: diff --git a/velox/docs/develop/testing/join-fuzzer.rst b/velox/docs/develop/testing/join-fuzzer.rst index be7d61a467bfc..1bbfbfc7df41b 100644 --- a/velox/docs/develop/testing/join-fuzzer.rst +++ b/velox/docs/develop/testing/join-fuzzer.rst @@ -42,7 +42,7 @@ Use velox_join_fuzzer_test binary to run join fuzzer: velox/exec/tests/velox_join_fuzzer_test -By default, the fuzzer will go through 10 interations. Use --steps +By default, the fuzzer will go through 10 iterations. Use --steps or --duration-sec flag to run fuzzer for longer. Use --seed to reproduce fuzzer failures. 
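The UDFHolder changes above add a third accepted return type for a simple function's call() methods: Status, alongside the existing void and bool. On success the output is treated as non-null, and a non-OK Status reports a row-level error without throwing. A minimal sketch of a simple function using the new signature (the function itself is illustrative, not part of the patch):

    #include <limits>
    #include "velox/functions/Udf.h"

    namespace facebook::velox {

    template <typename TExec>
    struct CheckedNegateFunction {
      VELOX_DEFINE_FUNCTION_TYPES(TExec);

      // Status-returning call(): 'result' is considered set and non-null
      // whenever Status::OK() is returned.
      Status call(out_type<int64_t>& result, const arg_type<int64_t>& input) {
        if (input == std::numeric_limits<int64_t>::min()) {
          return Status::UserError("Cannot negate the minimum int64 value");
        }
        result = -input;
        return Status::OK();
      }
    };

    } // namespace facebook::velox

Registration is unchanged, e.g. registerFunction<CheckedNegateFunction, int64_t, int64_t>({"checked_negate"}) with a name of your choosing.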
diff --git a/velox/docs/develop/testing/row-number-fuzzer.rst b/velox/docs/develop/testing/row-number-fuzzer.rst
new file mode 100644
index 0000000000000..6f304a50f72b8
--- /dev/null
+++ b/velox/docs/develop/testing/row-number-fuzzer.rst
@@ -0,0 +1,55 @@
+================
+RowNumber Fuzzer
+================
+
+The RowNumberFuzzer is a testing tool that automatically generates equivalent query plans and then executes these plans
+to validate the consistency of the results. It works as follows:
+
+1. Data Generation: It starts by generating a random set of input data, also known as a vector. This data can
+   have a variety of encodings and data layouts to ensure thorough testing.
+2. Plan Generation: Generate two equivalent query plans: one is row-number over ValuesNode as the base plan,
+   and the other is row-number over TableScanNode as the alternate plan.
+3. Query Execution: Executes those equivalent query plans using the generated data and asserts that the results are
+   consistent across different plans.
+   i. Execute the base plan, compare the result with the reference (DuckDB or Presto) and use it as the expected result.
+   #. Execute the alternate plan multiple times with and without spill, and compare each result with the
+      expected result.
+4. Iteration: This process is repeated multiple times to ensure reliability and robustness.
+
+How to run
+----------
+
+Use the velox_row_number_fuzzer_test binary to run the row-number fuzzer:
+
+::
+
+    velox/exec/tests/velox_row_number_fuzzer_test --seed 123 --duration_sec 60
+
+By default, the fuzzer will go through 10 iterations. Use --steps
+or --duration_sec flag to run the fuzzer for longer. Use --seed to
+reproduce fuzzer failures.
+
+Here is a full list of supported command line arguments.
+
+* ``--steps``: How many iterations to run. Each iteration generates and
+  evaluates one expression or aggregation. Default is 10.
+
+* ``--duration_sec``: For how long to run in seconds. If both ``--steps``
+  and ``--duration_sec`` are specified, ``--duration_sec`` takes precedence.
+
+* ``--seed``: The seed to generate random expressions and input vectors with.
+
+* ``--v=1``: Verbose logging (from `Google Logging Library `_).
+
+* ``--batch_size``: The size of input vectors to generate. Default is 100.
+
+* ``--num_batches``: The number of input vectors of size `--batch_size` to
+  generate. Default is 5.
+
+* ``--enable_spill``: Whether to test with spilling or not. Default is true.
+
+* ``--presto_url``: The PrestoQueryRunner URL along with its port number.
+
+* ``--req_timeout_ms``: Timeout in milliseconds of an HTTP request to the PrestoQueryRunner.
+
+If running from CLion IDE, add ``--logtostderr=1`` to see the full output.
diff --git a/velox/docs/functions/presto/binary.rst b/velox/docs/functions/presto/binary.rst
index e1d3615c7283e..52cf9969f0a3c 100644
--- a/velox/docs/functions/presto/binary.rst
+++ b/velox/docs/functions/presto/binary.rst
@@ -8,7 +8,26 @@ Binary Functions

 .. function:: from_base64(string) -> varbinary

-    Decodes binary data from the base64 encoded ``string``.
+    Decodes a Base64-encoded ``string`` back into its original binary form.
+    This function is capable of handling both fully padded and non-padded Base64 encoded strings.
+    Partially padded Base64 strings are not supported and will result in an error.
+
+    Examples
+    --------
+    Query with padded Base64 string:
+    ::
+        SELECT from_base64('SGVsbG8gV29ybGQ='); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]
+
+    Query with non-padded Base64 string:
+    ::
+        SELECT from_base64('SGVsbG8gV29ybGQ'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]
+
+    Query with a partially padded Base64 string:
+    ::
+        SELECT from_base64('SGVsbG8gV29ybGQgZm9yIHZlbG94IQ='); -- Error : Base64::decode() - invalid input string: string length is not a multiple of 4.
+
+    In the above examples, both the fully padded and non-padded Base64 strings ('SGVsbG8gV29ybGQ=' and 'SGVsbG8gV29ybGQ') decode to the binary representation of the text 'Hello World'.
+    The partially padded Base64 string 'SGVsbG8gV29ybGQgZm9yIHZlbG94IQ=', however, results in a Velox error.

 .. function:: from_base64url(string) -> varbinary
diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst
index 5472b5a2c8dbf..708a3349b93b1 100644
--- a/velox/docs/functions/spark/datetime.rst
+++ b/velox/docs/functions/spark/datetime.rst
@@ -217,10 +217,24 @@ These functions support TIMESTAMP and DATE input types.

 .. spark:function:: second(timestamp) -> integer

-    Returns the seconds of ``timestamp``.::
+    Returns the seconds of ``timestamp``. ::

         SELECT second('2009-07-30 12:58:59'); -- 59

+.. spark:function:: timestamp_micros(x) -> timestamp
+
+    Returns timestamp from the number of microseconds since UTC epoch.
+    Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT. ::
+
+        SELECT timestamp_micros(1230219000123123); -- '2008-12-25 15:30:00.123123'
+
+.. spark:function:: timestamp_millis(x) -> timestamp
+
+    Returns timestamp from the number of milliseconds since UTC epoch.
+    Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT. ::
+
+        SELECT timestamp_millis(1230219000123); -- '2008-12-25 15:30:00.123'
+
 .. spark:function:: to_unix_timestamp(string) -> integer

     Alias for ``unix_timestamp(string) -> integer``.
@@ -238,12 +252,31 @@ These functions support TIMESTAMP and DATE input types.

 .. spark:function:: unix_date(date) -> integer

-    Returns the number of days since 1970-01-01.::
+    Returns the number of days since 1970-01-01. ::

         SELECT unix_date('1970-01-01'); -- '0'
         SELECT unix_date('1970-01-02'); -- '1'
         SELECT unix_date('1969-12-31'); -- '-1'

+.. spark:function:: unix_micros(timestamp) -> bigint
+
+    Returns the number of microseconds since 1970-01-01 00:00:00 UTC. ::
+
+        SELECT unix_micros('1970-01-01 00:00:01'); -- 1000000
+
+.. spark:function:: unix_millis(timestamp) -> bigint
+
+    Returns the number of milliseconds since 1970-01-01 00:00:00 UTC. Truncates
+    higher levels of precision. ::
+
+        SELECT unix_millis('1970-01-01 00:00:01'); -- 1000
+
+.. spark:function:: unix_seconds(timestamp) -> bigint
+
+    Returns the number of seconds since 1970-01-01 00:00:00 UTC. ::
+
+        SELECT unix_seconds('1970-01-01 00:00:01'); -- 1
+
 .. spark:function:: unix_timestamp() -> integer

     Returns the current UNIX timestamp in seconds.
@@ -272,7 +305,7 @@ These functions support TIMESTAMP and DATE input types.

 .. function:: weekday(date) -> integer

-    Returns the day of the week for date (0 = Monday, 1 = Tuesday, …, 6 = Sunday).::
+    Returns the day of the week for date (0 = Monday, 1 = Tuesday, …, 6 = Sunday).
:: SELECT weekday('2015-04-08'); -- 2 SELECT weekday('2024-02-10'); -- 5 diff --git a/velox/docs/monthly-updates.rst b/velox/docs/monthly-updates.rst index 5945ec03d90a6..a34d51fedcff9 100644 --- a/velox/docs/monthly-updates.rst +++ b/velox/docs/monthly-updates.rst @@ -5,6 +5,9 @@ Monthly Updates .. toctree:: :maxdepth: 1 + monthly-updates/april-2024 + monthly-updates/march-2024 + monthly-updates/february-2024 monthly-updates/january-2024 monthly-updates/2023/index monthly-updates/2022/index diff --git a/velox/docs/monthly-updates/april-2024.rst b/velox/docs/monthly-updates/april-2024.rst new file mode 100644 index 0000000000000..919e8551bdbb1 --- /dev/null +++ b/velox/docs/monthly-updates/april-2024.rst @@ -0,0 +1,63 @@ +***************** +April 2024 Update +***************** + +Documentation +============= + +* Document operations on decimals for :doc:`Presto ` + and :doc:`Spark `. +* Document spill write stats. :pr:`9326` + +Core Library +============ + +* Fix bugs in Window operator. :pr:`9476`, :pr:`9271`, :pr:`9257` + +Presto Functions +================ + +* Add :func:`word_stem` and :func:`to_iso8601` scalar functions. +* Add support for DECIMAL inputs to :func:`arbitrary`, :func:`min` and :func:`max` aggregate functions. +* Fix :func:`json_extract` for paths with wildcards. + +Spark Functions +=============== + +* Add :spark:func:`array_size`, :spark:func:`flatten`, :spark:func:`year_of_week` scalar functions. +* Add :spark:func:`collect_list` and :spark:func:`regr_replacement` aggregate functions. + +Hive Connector +============== + +* Add support for storing decimal as integer in Parquet writer. +* Add hive.s3.connect-timeout, hive.s3.socket-timeout and hive.s3.max-connections configs. :pr:`9472` +* Fix complex type handing in Parquet reader. :pr:`9187` +* Fix DWRF reader to skip null map keys. + +Performance and Correctness +=========================== + +* Add aggregation and window fuzzer runs to every PR. +* Add nightly run of window fuzzer. +* Add check for aggregate function signature changes to every PR. +* Add biased aggregation fuzzer run for newly added aggregate functions to every PR. + +Build System +============ + +* Add nightly job to track build metrics. + +Credits +======= + +Andres Suarez, Andrii Rosa, Ankita Victor, Ashwin Krishna Kumar, Bikramjeet Vig, +Christian Zentgraf, Daniel Munoz, David McKnight, Deepak Majeti, Hengzhi Chen, +Huameng (Michael) Jiang, Jacob Wujciak-Jens, Jeongseok Lee, Jialiang Tan, Jimmy +Lu, Karteekmurthys, Ke, Kevin Wilfong, Krishna Pai, Lu Niu, Ludovic Henry, Ma, +Rong, Mahadevuni Naveen Kumar, Masha Basmanova, Mike Lui, Minhan Cao, PHILO-HE, +Pedro Eugenio Rocha Pedreira, Pedro Pedreira, Pramod, Qian Sun, Richard Barnes, +Sergey Pershin, Shabab Ayub, Tengfei Huang, Terry Wang, Wei He, Weitao Wan, +Wills Feng, Yang Zhang, Yihong Wang, Yoav Helfman, Zac Wen, Zhenyuan Zhao, +aditi-pandit, chliang, cindyyyang, duanmeng, jay.narale, joey.ljy, mohsaka, +rui-mo, svm1, willsfeng, wutiangan, wypb, xiaoxmeng, yingsu00, zhli1142015 diff --git a/velox/docs/monthly-updates/february-2024.rst b/velox/docs/monthly-updates/february-2024.rst new file mode 100644 index 0000000000000..18704d2bb5ac4 --- /dev/null +++ b/velox/docs/monthly-updates/february-2024.rst @@ -0,0 +1,68 @@ +******************** +February 2024 Update +******************** + +Core Library +============ + +* Add support for aggregations over distinct inputs to StreamingAggregation. +* Add support for deserializing a single column in Presto page format. 
+* Add support for deserializing an all-null column serialized as UNKNOWN type in Presto page format. +* Add stats for null skew in join operator. +* Convert TIMESTAMP_WITH_TIME_ZONE type to a primitive type. +* Add background profiler that starts Linux perf on the Velox process. +* Fix ``out of range in dynamic array`` error in Task::toJson. +* Delete unused ``max_arbitrary_buffer_size`` config. + +Presto Functions +================ + +* Add :func:`typeof`, :func:`from_iso8601_date` scalar functions. +* Add support for DECIMAL input type to :func:`set_agg` and :func:`set_union` aggregate functions. +* Add support for UNKNOWN input type to :func:`checksum` aggregate function. +* Add support for DATE +/- INTERVAL YEAR MONTH functions. +* Add support for ``UCT|UCT|GMT|GMT0`` as ``Z`` to :func:`parse_datetime` scalar function. + +Spark Functions +=============== + +* Add :spark:func:`array_repeat`, :spark:func:`date_from_unix_date`, :spark:func:`weekday`, :spark:func:`minute`, :spark:func:`second` scalar functions. +* Add :spark:func:`ntile` window function. + +Hive Connector +============== + +* Add ``ignore_missing_files`` config. +* Add write support to ABFS file system. +* Add support for proxy to S3 file system. + +Arrow +===== + +* Add support to export UNKNOWN type to Arrow array. +* Add support to convert Arrow REE arrays to Velox Vectors. + +Performance and Correctness +=========================== + +* Add FieldReference benchmark. +* Add :ref:`Window fuzzer `. +* Fix ``Too many open files`` error in Join fuzzer. + +Build System +============ + +* Add ``VELOX_BUILD_MINIMAL_WITH_DWIO`` CMake option. +* Move documentation, header and format check to Github Action. + +Credits +======= + +Aaron Feldman, Ankita Victor, Bikramjeet Vig, Christian Zentgraf, Daniel Munoz, +David McKnight, Deepak Majeti, Ge Gao, Hongze Zhang, Jacob Wujciak-Jens, Jia Ke, +Jialiang Tan, Jimmy Lu, Kevin Wilfong, Krishna Pai, Lu Niu, Masha Basmanova, +Nick Terrell, Orri Erling, PHILO-HE, Pedro Pedreira, Pramod, Pranjal Shankhdhar, +Richard Barnes, Schierbeck, Cody, Sergey Pershin, Wei He, Yedidya Feldblum, +Zac Wen, Zhenyuan Zhao, aditi-pandit, duanmeng, gayangya, hengjiang.ly, hitarth, +lingbin, mwish, rrando901, rui-mo, xiaodou, xiaoxmeng, xumingming, yingsu00, +zhli1142015, 高阳阳 diff --git a/velox/docs/monthly-updates/march-2024.rst b/velox/docs/monthly-updates/march-2024.rst new file mode 100644 index 0000000000000..636f52ed3a4f4 --- /dev/null +++ b/velox/docs/monthly-updates/march-2024.rst @@ -0,0 +1,79 @@ +***************** +March 2024 Update +***************** + +Documentation +============= + +* Document `design philosophy `_ +* Document custom input generators and verifiers supported in the Aggregation Fuzzer. +* Document runtime stats reported by the HashTable. :pr:`9255` +* Document usage of generic types in Simple Function API. :pr:`9084` + +Core Library +============ + +* Add prefix-sort for fixed width sorting keys. +* Add null behavior and determinism scalar function metadata to the registry. :pr:`9209` +* Add order-sensitive aggregate function metadata to the registry. :pr:`9050` +* Add support for DECIMAL type to Simple Function API. :pr:`9096` +* Add support for lambda functions (reduce_agg) to StreamingAggregation. +* Deprecate threshold based spilling in Aggregation and OrderBy. +* Optimize Exchange protocol used by Presto for latency. 
:pr:`8845` + +Presto Functions +================ + +* Add :func:`day`, :func:`from_ieee754_32`, :func:`hamming_distance`, :func:`map_normalize`, + :func:`map_top_n` scalar functions. +* Add support for DECIMAL input type to :func:`floor` function. +* Add support for timestamp +/- IntervalYearMonth. +* Add :func:`regr_avgx`, :func:`regr_avgy`, :func:`regr_count`, :func:`regr_r2`, + :func:`regr_sxx`, :func:`regr_sxy`, and :func:`regr_syy` aggregation functions. + +Spark Functions +=============== + +* Add :spark:func:`array_remove`, :spark:func:`bit_length`, :spark:func:`bitwise_xor`, + :spark:func:`bitwise_not`, :spark:func:`make_ym_interval`, :spark:func:`from_utc_timestamp`, + :spark:func:`to_utc_timestamp`, :spark:func:`make_timestamp`, :spark:func:`map_subset`, + :spark:func:`unhex`, :spark:func:`unix_date`, :spark:func:`uuid` functions. +* Add :spark:func:`regexp_replace` function. +* Add :spark:func:`monotonically_increasing_id`, :spark:func:`spark_partition_id` functions. +* Add :spark:func:`kurtosis` and :spark:func:`skewness` aggregation functions. +* Add support for DECIMAL inputs to :spark:func:`sum` aggregation function. +* Add CAST(real as decimal). +* Add configuration property 'spark.partition_id'. + +Hive Connector +============== + +* Add support for S3 client no_proxy CIDR expression. :pr:`9160` +* Add support for synthetic columns '$file_size' and '$file_modified_time'. +* Optimize reading a small sample of rows. :pr:`8920`. +* Fix Parquet reader for files with different encodings across row groups. :pr:`9129` + +Performance and Correctness +=========================== + +* Add nightly run of Aggregation fuzzer using Presto as source of truth. +* Add nightly run of Exchange fuzzer. +* Add utility to randomly trigger OOMs and integrate it into Aggregation and Join fuzzers. +* Add group execution mode to Join fuzzer. +* Add support for random frame clause generation to Window fuzzer. +* Add custom input generator for map_union_sum Presto aggregation function. +* Add custom result verifier for arbitrary Presto aggregation function. + +Credits +======= + +8dukongjian, Amit Dutta, Ankita Victor, Bikramjeet Vig, Christian Zentgraf, +Daniel Munoz, Deepak Majeti, Ge Gao, InitialZJ, Jacob Wujciak-Jens, Jake Jung, +Jialiang Tan, Jimmy Lu, Karteekmurthys, Kevin Wilfong, Krishna Pai, Ma, Rong, +Mahadevuni Naveen Kumar, Marcus D. 
Hanwell, Masha Basmanova, Nicholas Ormrod,
+Nick Terrell, Orri Erling, PHILO-HE, Patrick Sullivan, Pedro Pedreira, Pramod,
+Pratik Joseph Dabre, Qian Sun, Richard Barnes, Sandino Flores, Schierbeck,
+Cody, Sergey Pershin, Ubuntu, Wei He, Yang Zhang, Zac Wen, aditi-pandit,
+duanmeng, f0rest9999, hengjiang.ly, joey.ljy, lingbin, mwish, rexan, rui-mo,
+willsfeng, wypb, xiaodai1002, xiaoxmeng, xumingming, youxiduo, yuling.sh,
+zhli1142015, zky.zhoukeyong
diff --git a/velox/dwio/common/CacheInputStream.cpp b/velox/dwio/common/CacheInputStream.cpp
index 98c36e457b606..462a6ba8b87e9 100644
--- a/velox/dwio/common/CacheInputStream.cpp
+++ b/velox/dwio/common/CacheInputStream.cpp
@@ -127,7 +127,13 @@ void CacheInputStream::seekToPosition(PositionProvider& seekPosition) {
 }

 std::string CacheInputStream::getName() const {
-  return fmt::format("CacheInputStream {} of {}", position_, region_.length);
+  std::string result =
+      fmt::format("CacheInputStream {} of {}", position_, region_.length);
+  auto ssdFile = ssdFileName();
+  if (!ssdFile.empty()) {
+    result += fmt::format(" ssdFile={}", ssdFile);
+  }
+  return result;
 }

 size_t CacheInputStream::positionSize() {
@@ -285,6 +291,14 @@ bool CacheInputStream::loadFromSsd(
   return true;
 }

+std::string CacheInputStream::ssdFileName() const {
+  auto ssdCache = cache_->ssdCache();
+  if (!ssdCache) {
+    return "";
+  }
+  return ssdCache->file(fileNum_).fileName();
+}
+
 void CacheInputStream::loadPosition() {
   auto offset = region_.offset;
   if (pin_.empty()) {
diff --git a/velox/dwio/common/CacheInputStream.h b/velox/dwio/common/CacheInputStream.h
index 5a99f5b35c829..6b95bf3713bba 100644
--- a/velox/dwio/common/CacheInputStream.h
+++ b/velox/dwio/common/CacheInputStream.h
@@ -109,6 +109,10 @@ class CacheInputStream : public SeekableInputStream {
       velox::common::Region region,
       cache::AsyncDataCacheEntry& entry);

+  // Returns the SSD cache file path if it exists; returns an empty string if
+  // there is no SSD cache file.
+ std::string ssdFileName() const; + CachedBufferedInput* const bufferedInput_; cache::AsyncDataCache* const cache_; IoStatistics* ioStats_; diff --git a/velox/dwio/common/ColumnVisitors.h b/velox/dwio/common/ColumnVisitors.h index 3ac3d5e219f74..cc81f4505fee0 100644 --- a/velox/dwio/common/ColumnVisitors.h +++ b/velox/dwio/common/ColumnVisitors.h @@ -49,29 +49,32 @@ struct DropValues { } }; -template struct ExtractToReader { using HookType = dwio::common::NoHook; static constexpr bool kSkipNulls = false; - explicit ExtractToReader(TReader* readerIn) : reader(readerIn) {} + explicit ExtractToReader(SelectiveColumnReader* readerIn) + : reader_(readerIn) {} bool acceptsNulls() const { return true; } template - void addNull(vector_size_t rowIndex); + void addNull(vector_size_t /*rowIndex*/) { + reader_->template addNull(); + } template void addValue(vector_size_t /*rowIndex*/, V value) { - reader->addValue(value); + reader_->addValue(value); } - TReader* reader; - dwio::common::NoHook& hook() { return noHook(); } + + private: + SelectiveColumnReader* reader_; }; template @@ -150,6 +153,7 @@ class ColumnVisitor { using DataType = T; static constexpr bool dense = isDense; static constexpr bool kHasBulkPath = true; + ColumnVisitor( TFilter& filter, SelectiveColumnReader* reader, @@ -163,6 +167,20 @@ class ColumnVisitor { rowIndex_(0), values_(values) {} + template = 0> + ColumnVisitor( + TFilter& filter, + SelectiveColumnReader* reader, + vector_size_t numRows, + ExtractValues values) + : filter_(filter), + reader_(reader), + allowNulls_(!TFilter::deterministic || filter.testNull()), + rows_(nullptr), + numRows_(numRows), + rowIndex_(0), + values_(values) {} + bool allowNulls() { if (ExtractValues::kSkipNulls && TFilter::deterministic) { return false; @@ -269,7 +287,7 @@ class ColumnVisitor { } if (++rowIndex_ >= numRows_) { atEnd = true; - return rows_[numRows_ - 1] - previous; + return rowAt(numRows_ - 1) - previous; } if (TFilter::deterministic && isDense) { return 0; @@ -301,7 +319,7 @@ class ColumnVisitor { if (isDense) { return 0; } - return currentRow() - rows_[rowIndex_ - 1] - 1; + return currentRow() - rowAt(rowIndex_ - 1) - 1; } FOLLY_ALWAYS_INLINE vector_size_t process(T value, bool& atEnd) { @@ -314,7 +332,7 @@ class ColumnVisitor { } if (++rowIndex_ >= numRows_) { atEnd = true; - return rows_[numRows_ - 1] - previous; + return rowAt(numRows_ - 1) - previous; } return currentRow() - previous - 1; } @@ -331,7 +349,7 @@ class ColumnVisitor { if (isDense) { return 0; } - return currentRow() - rows_[rowIndex_ - 1] - 1; + return currentRow() - rowAt(rowIndex_ - 1) - 1; } // Returns space for 'size' items of T for a scan to fill. 
The scan @@ -341,26 +359,30 @@ class ColumnVisitor { return reader_->mutableValues(size); } - int32_t numRows() const { - return reader_->numRows(); - } - SelectiveColumnReader& reader() const { return *reader_; } - inline vector_size_t rowAt(vector_size_t index) { + inline vector_size_t rowAt(vector_size_t index) const { if (isDense) { return index; } return rows_[index]; } - bool atEnd() { + vector_size_t rowIndex() const { + return rowIndex_; + } + + void setRowIndex(vector_size_t index) { + rowIndex_ = index; + } + + bool atEnd() const { return rowIndex_ >= numRows_; } - vector_size_t currentRow() { + vector_size_t currentRow() const { if (isDense) { return rowIndex_; } @@ -371,7 +393,7 @@ class ColumnVisitor { return rows_; } - vector_size_t numRows() { + vector_size_t numRows() const { return numRows_; } @@ -504,12 +526,6 @@ inline void ColumnVisitor::addOutputRow( reader_->addOutputRow(row); } -template -template -void ExtractToReader::addNull(vector_size_t /*rowIndex*/) { - reader->template addNull(); -} - enum FilterResult { kUnknown = 0x40, kSuccess = 0x80, kFailure = 0 }; namespace detail { @@ -1390,13 +1406,6 @@ class DirectRleColumnVisitor rows, values) {} - // Use for replacing all rows with non-null rows for fast path with - // processRun and processRle. - void setRows(folly::Range newRows) { - super::rows_ = newRows.data(); - super::numRows_ = newRows.size(); - } - // Processes 'numInput' T's in 'input'. Sets 'values' and // 'numValues'' to the resulting values. 'scatterRows' may be // non-null if there is no filter and the decoded values should be @@ -1479,4 +1488,113 @@ class DirectRleColumnVisitor } }; +template +class StringColumnReadWithVisitorHelper { + public: + StringColumnReadWithVisitorHelper(SelectiveColumnReader& reader, RowSet rows) + : reader_(reader), rows_(rows) {} + + template + auto operator()(F&& readWithVisitor) { + const bool isDense = rows_.back() == rows_.size() - 1; + if (reader_.scanSpec()->keepValues()) { + if (auto* hook = reader_.scanSpec()->valueHook()) { + if (isDense) { + readHelper( + &alwaysTrue(), + ExtractToGenericHook(hook), + std::forward(readWithVisitor)); + } else { + readHelper( + &alwaysTrue(), + ExtractToGenericHook(hook), + std::forward(readWithVisitor)); + } + } else { + if (isDense) { + processFilter( + ExtractToReader(&reader_), std::forward(readWithVisitor)); + } else { + processFilter( + ExtractToReader(&reader_), std::forward(readWithVisitor)); + } + } + } else { + if (isDense) { + processFilter(DropValues(), std::forward(readWithVisitor)); + } else { + processFilter(DropValues(), std::forward(readWithVisitor)); + } + } + } + + private: + template + void readHelper( + velox::common::Filter* filter, + ExtractValues extractValues, + F readWithVisitor) { + readWithVisitor( + ColumnVisitor( + *static_cast(filter), &reader_, rows_, extractValues)); + } + + template + void processFilter(ExtractValues extractValues, F&& readWithVisitor) { + auto* filter = reader_.scanSpec()->filter(); + if (filter == nullptr) { + readHelper( + &alwaysTrue(), extractValues, std::forward(readWithVisitor)); + return; + } + switch (filter->kind()) { + case velox::common::FilterKind::kAlwaysTrue: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kIsNull: + if constexpr (kEncodingHasNulls) { + reader_.filterNulls( + rows_, true, !std::is_same_v); + } else { + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + } + break; + case velox::common::FilterKind::kIsNotNull: + if 
constexpr ( + kEncodingHasNulls && std::is_same_v) { + reader_.filterNulls(rows_, false, false); + } else { + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + } + break; + case velox::common::FilterKind::kBytesRange: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kNegatedBytesRange: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kBytesValues: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kNegatedBytesValues: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + default: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + } + } + + SelectiveColumnReader& reader_; + const RowSet rows_; +}; + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/FormatData.h b/velox/dwio/common/FormatData.h index 0348604606465..1f0b5d4426bb8 100644 --- a/velox/dwio/common/FormatData.h +++ b/velox/dwio/common/FormatData.h @@ -34,7 +34,7 @@ class FormatData { template T& as() { - return *reinterpret_cast(this); + return *static_cast(this); } /// Reads nulls if the format has nulls separate from the encoded diff --git a/velox/dwio/common/OnDemandUnitLoader.cpp b/velox/dwio/common/OnDemandUnitLoader.cpp index a15998002cc8c..ee21a2b442338 100644 --- a/velox/dwio/common/OnDemandUnitLoader.cpp +++ b/velox/dwio/common/OnDemandUnitLoader.cpp @@ -58,10 +58,18 @@ class OnDemandUnitLoader : public UnitLoader { return *loadUnits_[unit]; } - void onRead( - uint32_t /* unit */, - uint64_t /* rowOffsetInUnit */, - uint64_t /* rowCount */) override {} + void onRead(uint32_t unit, uint64_t rowOffsetInUnit, uint64_t /* rowCount */) + override { + VELOX_CHECK_LT(unit, loadUnits_.size(), "Unit out of range"); + VELOX_CHECK_LT( + rowOffsetInUnit, loadUnits_[unit]->getNumRows(), "Row out of range"); + } + + void onSeek(uint32_t unit, uint64_t rowOffsetInUnit) override { + VELOX_CHECK_LT(unit, loadUnits_.size(), "Unit out of range"); + VELOX_CHECK_LE( + rowOffsetInUnit, loadUnits_[unit]->getNumRows(), "Row out of range"); + } private: std::vector> loadUnits_; diff --git a/velox/dwio/common/Options.h b/velox/dwio/common/Options.h index 64566f3a1af95..5f806f22b08fe 100644 --- a/velox/dwio/common/Options.h +++ b/velox/dwio/common/Options.h @@ -45,11 +45,7 @@ enum class FileFormat { TEXT = 5, JSON = 6, PARQUET = 7, -#ifdef VELOX_ENABLE_BACKWARD_COMPATIBILITY - ALPHA = 8, -#else NIMBLE = 8, -#endif ORC = 9, }; @@ -83,6 +79,7 @@ class SerDeOptions { inline static const std::string kFieldDelim{"field.delim"}; inline static const std::string kCollectionDelim{"collection.delim"}; inline static const std::string kMapKeyDelim{"mapkey.delim"}; + inline static const std::string kEscapeChar{"escape.delim"}; explicit SerDeOptions( uint8_t fieldDelim = '\1', diff --git a/velox/dwio/common/SelectiveByteRleColumnReader.h b/velox/dwio/common/SelectiveByteRleColumnReader.h index 06aae1c4986b7..67537ea8d8b8f 100644 --- a/velox/dwio/common/SelectiveByteRleColumnReader.h +++ b/velox/dwio/common/SelectiveByteRleColumnReader.h @@ -39,7 +39,11 @@ class SelectiveByteRleColumnReader : public SelectiveColumnReader { void getValues(RowSet rows, VectorPtr* result) override; - template + template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void processFilter( velox::common::Filter* filter, ExtractValues extractValues, @@ 
-58,7 +62,7 @@ class SelectiveByteRleColumnReader : public SelectiveColumnReader { RowSet rows, ExtractValues extractValues); - template + template void readCommon(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); }; @@ -78,7 +82,11 @@ void SelectiveByteRleColumnReader::readHelper( *reinterpret_cast(filter), this, rows, extractValues)); } -template +template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void SelectiveByteRleColumnReader::processFilter( velox::common::Filter* filter, ExtractValues extractValues, @@ -90,13 +98,20 @@ void SelectiveByteRleColumnReader::processFilter( filter, rows, extractValues); break; case FilterKind::kIsNull: - filterNulls( - rows, - true, - !std::is_same_v); + if constexpr (kEncodingHasNulls) { + filterNulls( + rows, + true, + !std::is_same_v); + } else { + readHelper( + filter, rows, extractValues); + } break; case FilterKind::kIsNotNull: - if (std::is_same_v) { + if constexpr ( + kEncodingHasNulls && + std::is_same_v) { filterNulls(rows, false, false); } else { readHelper( @@ -148,7 +163,7 @@ void SelectiveByteRleColumnReader::processValueHook( } } -template +template void SelectiveByteRleColumnReader::readCommon( vector_size_t offset, RowSet rows, @@ -167,17 +182,19 @@ void SelectiveByteRleColumnReader::readCommon( return; } if (isDense) { - processFilter( + processFilter( filter, dwio::common::ExtractToReader(this), rows); } else { - processFilter( + processFilter( filter, dwio::common::ExtractToReader(this), rows); } } else { if (isDense) { - processFilter(filter, dwio::common::DropValues(), rows); + processFilter( + filter, dwio::common::DropValues(), rows); } else { - processFilter(filter, dwio::common::DropValues(), rows); + processFilter( + filter, dwio::common::DropValues(), rows); } } } diff --git a/velox/dwio/common/SelectiveColumnReader.h b/velox/dwio/common/SelectiveColumnReader.h index 4b97f4c6f4652..1fdcbc2e2a762 100644 --- a/velox/dwio/common/SelectiveColumnReader.h +++ b/velox/dwio/common/SelectiveColumnReader.h @@ -294,9 +294,8 @@ class SelectiveColumnReader { template inline void addNull() { VELOX_DCHECK_NE(valueSize_, kNoValueSize); - VELOX_DCHECK_LE( - rawResultNulls_ && rawValues_ && (numValues_ + 1) * valueSize_, - values_->capacity()); + VELOX_DCHECK(rawResultNulls_ && rawValues_); + VELOX_DCHECK_LE((numValues_ + 1) * valueSize_, values_->capacity()); anyNulls_ = true; bits::setNull(rawResultNulls_, numValues_); @@ -441,12 +440,12 @@ class SelectiveColumnReader { isFlatMapValue_ = value; } - protected: // Filters 'rows' according to 'is_null'. Only applies to cases where // scanSpec_->readsNullsOnly() is true. 
template void filterNulls(RowSet rows, bool isNull, bool extractValues); + protected: template void prepareRead(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); @@ -670,6 +669,8 @@ inline void SelectiveColumnReader::addValue(const folly::StringPiece value) { addStringValue(value); } +velox::common::AlwaysTrue& alwaysTrue(); + } // namespace facebook::velox::dwio::common namespace facebook::velox::dwio::common { diff --git a/velox/dwio/common/SelectiveColumnReaderInternal.h b/velox/dwio/common/SelectiveColumnReaderInternal.h index 61bcf2b5befa9..ed38c6551fd55 100644 --- a/velox/dwio/common/SelectiveColumnReaderInternal.h +++ b/velox/dwio/common/SelectiveColumnReaderInternal.h @@ -31,8 +31,6 @@ namespace facebook::velox::dwio::common { -velox::common::AlwaysTrue& alwaysTrue(); - class Timer { public: Timer() : startClocks_{folly::hardware_timestamp()} {} diff --git a/velox/dwio/common/SelectiveFloatingPointColumnReader.h b/velox/dwio/common/SelectiveFloatingPointColumnReader.h index 61ccd7e4d8b6d..ea2455afa0c8c 100644 --- a/velox/dwio/common/SelectiveFloatingPointColumnReader.h +++ b/velox/dwio/common/SelectiveFloatingPointColumnReader.h @@ -40,7 +40,7 @@ class SelectiveFloatingPointColumnReader : public SelectiveColumnReader { return std::is_same_v; } - template + template void readCommon(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); @@ -57,7 +57,11 @@ class SelectiveFloatingPointColumnReader : public SelectiveColumnReader { void readHelper(velox::common::Filter* filter, RowSet rows, ExtractValues values); - template + template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void processFilter( velox::common::Filter* filter, RowSet rows, @@ -84,7 +88,11 @@ void SelectiveFloatingPointColumnReader::readHelper( } template -template +template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void SelectiveFloatingPointColumnReader::processFilter( velox::common::Filter* filter, RowSet rows, @@ -101,11 +109,18 @@ void SelectiveFloatingPointColumnReader::processFilter( filter, rows, extractValues); break; case velox::common::FilterKind::kIsNull: - filterNulls( - rows, true, !std::is_same_v); + if constexpr (kEncodingHasNulls) { + filterNulls( + rows, true, !std::is_same_v); + } else { + readHelper( + filter, rows, extractValues); + } break; case velox::common::FilterKind::kIsNotNull: - if (std::is_same_v) { + if constexpr ( + kEncodingHasNulls && + std::is_same_v) { filterNulls(rows, false, false); } else { readHelper( @@ -163,7 +178,7 @@ void SelectiveFloatingPointColumnReader::processValueHook( } template -template +template void SelectiveFloatingPointColumnReader::readCommon( vector_size_t offset, RowSet rows, @@ -179,18 +194,20 @@ void SelectiveFloatingPointColumnReader::readCommon( } } else { if (isDense) { - processFilter( + processFilter( scanSpec_->filter(), rows, ExtractToReader(this)); } else { - processFilter( + processFilter( scanSpec_->filter(), rows, ExtractToReader(this)); } } } else { if (isDense) { - processFilter(scanSpec_->filter(), rows, DropValues()); + processFilter( + scanSpec_->filter(), rows, DropValues()); } else { - processFilter(scanSpec_->filter(), rows, DropValues()); + processFilter( + scanSpec_->filter(), rows, DropValues()); } } } diff --git a/velox/dwio/common/SelectiveIntegerColumnReader.h b/velox/dwio/common/SelectiveIntegerColumnReader.h index ba4b63e168ca0..444f341e06b9f 100644 --- a/velox/dwio/common/SelectiveIntegerColumnReader.h +++ 
b/velox/dwio/common/SelectiveIntegerColumnReader.h @@ -41,7 +41,11 @@ class SelectiveIntegerColumnReader : public SelectiveColumnReader { protected: // Switches based on filter type between different readHelper instantiations. - template + template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void processFilter( velox::common::Filter* filter, ExtractValues extractValues, @@ -66,7 +70,7 @@ class SelectiveIntegerColumnReader : public SelectiveColumnReader { // The common part of integer reading. calls the appropriate // instantiation of processValueHook or processFilter based on // possible value hook, filter and denseness. - template + template void readCommon(RowSet rows); }; @@ -113,7 +117,11 @@ void SelectiveIntegerColumnReader::readHelper( } } -template +template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void SelectiveIntegerColumnReader::processFilter( velox::common::Filter* filter, ExtractValues extractValues, @@ -130,11 +138,18 @@ void SelectiveIntegerColumnReader::processFilter( filter, rows, extractValues); break; case velox::common::FilterKind::kIsNull: - filterNulls( - rows, true, !std::is_same_v); + if constexpr (kEncodingHasNulls) { + filterNulls( + rows, true, !std::is_same_v); + } else { + readHelper( + filter, rows, extractValues); + } break; case velox::common::FilterKind::kIsNotNull: - if (std::is_same_v) { + if constexpr ( + kEncodingHasNulls && + std::is_same_v) { filterNulls(rows, false, false); } else { readHelper( @@ -211,7 +226,7 @@ void SelectiveIntegerColumnReader::processValueHook( } } -template +template void SelectiveIntegerColumnReader::readCommon(RowSet rows) { bool isDense = rows.back() == rows.size() - 1; velox::common::Filter* filter = @@ -225,16 +240,20 @@ void SelectiveIntegerColumnReader::readCommon(RowSet rows) { } } else { if (isDense) { - processFilter(filter, ExtractToReader(this), rows); + processFilter( + filter, ExtractToReader(this), rows); } else { - processFilter(filter, ExtractToReader(this), rows); + processFilter( + filter, ExtractToReader(this), rows); } } } else { if (isDense) { - processFilter(filter, DropValues(), rows); + processFilter( + filter, DropValues(), rows); } else { - processFilter(filter, DropValues(), rows); + processFilter( + filter, DropValues(), rows); } } } diff --git a/velox/dwio/common/TypeWithId.cpp b/velox/dwio/common/TypeWithId.cpp index 6b803a6249da4..c4c9b4f8d36fd 100644 --- a/velox/dwio/common/TypeWithId.cpp +++ b/velox/dwio/common/TypeWithId.cpp @@ -86,4 +86,37 @@ std::unique_ptr TypeWithId::create( type, std::move(children), myId, maxId, column); } +std::string TypeWithId::fullName() const { + std::vector path; + auto* child = this; + while (child->parent_) { + switch (child->parent_->type()->kind()) { + case TypeKind::ROW: + VELOX_CHECK( + child == child->parent_->children_.at(child->column_).get()); + path.push_back( + '.' 
+ child->parent_->type()->asRow().nameOf(child->column_));
+        break;
+      case TypeKind::ARRAY:
+        break;
+      case TypeKind::MAP:
+        if (child == child->parent_->children_.at(0).get()) {
+          path.push_back(".<keys>");
+        } else {
+          VELOX_CHECK(child == child->parent_->children_.at(1).get());
+          path.push_back(".<values>");
+        }
+        break;
+      default:
+        VELOX_UNREACHABLE();
+    }
+    child = child->parent_;
+  }
+  std::string ans = "";
+  for (int i = path.size() - 1; i >= 0; --i) {
+    ans += path[i];
+  }
+  return ans;
+}
+
 } // namespace facebook::velox::dwio::common
diff --git a/velox/dwio/common/TypeWithId.h b/velox/dwio/common/TypeWithId.h
index 39a988f9936a9..5c5fbc5d070c7 100644
--- a/velox/dwio/common/TypeWithId.h
+++ b/velox/dwio/common/TypeWithId.h
@@ -73,6 +73,8 @@ class TypeWithId : public velox::Tree<std::shared_ptr<const TypeWithId>> {
     return children_;
   }

+  std::string fullName() const;
+
  private:
   static std::unique_ptr<TypeWithId> create(
       const std::shared_ptr<const Type>& type,
diff --git a/velox/dwio/common/UnitLoader.h b/velox/dwio/common/UnitLoader.h
index f536a2d5eef16..3ea9653d521f9 100644
--- a/velox/dwio/common/UnitLoader.h
+++ b/velox/dwio/common/UnitLoader.h
@@ -49,8 +49,13 @@ class UnitLoader {
   virtual LoadUnit& getLoadedUnit(uint32_t unit) = 0;

   // Reader reports progress calling this method
+  // The call must be done **after** getLoadedUnit for the unit
   virtual void onRead(uint32_t unit, uint64_t rowOffsetInUnit, uint64_t rowCount) = 0;
+
+  // Reader reports a seek by calling this method.
+  // The call must be done **before** getLoadedUnit for the new unit
+  virtual void onSeek(uint32_t unit, uint64_t rowOffsetInUnit) = 0;
 };

 class UnitLoaderFactory {
diff --git a/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp b/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp
index 241d91117e060..2a07b45746465 100644
--- a/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp
+++ b/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp
@@ -38,7 +38,7 @@ using RowSet = folly::Range<const int32_t*>;

 static const uint64_t kNumValues = 1024768 * 8;

-namespace duckdb {
+namespace facebook::velox::parquet {

 class ByteBuffer { // on to the 10 thousandth impl
  public:
@@ -65,7 +65,7 @@ class ByteBuffer { // on to the 10 thousandth impl
   template <class T>
   T get() {
     available(sizeof(T));
-    T val = Load<T>((data_ptr_t)ptr);
+    T val = duckdb::Load<T>((duckdb::data_ptr_t)ptr);
     return val;
   }

@@ -104,7 +104,7 @@ class ParquetDecodeUtils {
       uint32_t count,
       uint8_t width) {
     if (width >= ParquetDecodeUtils::BITPACK_MASKS_SIZE) {
-      throw InvalidInputException(
+      throw duckdb::InvalidInputException(
           "The width (%d) of the bitpacked data exceeds the supported max width (%d), "
           "the file might be corrupted.",
           width,
@@ -145,9 +145,9 @@ class ParquetDecodeUtils {
     return result;
   }
 };
-} // namespace duckdb
+} // namespace facebook::velox::parquet

-const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS[] = {
+const uint64_t facebook::velox::parquet::ParquetDecodeUtils::BITPACK_MASKS[] = {
     0,
     1,
     3,
@@ -214,10 +214,11 @@ const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS[] = {
     9223372036854775807,
     18446744073709551615ULL};

-const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS_SIZE =
-    sizeof(ParquetDecodeUtils::BITPACK_MASKS) / sizeof(uint64_t);
+const uint64_t
+    facebook::velox::parquet::ParquetDecodeUtils::BITPACK_MASKS_SIZE =
+        sizeof(ParquetDecodeUtils::BITPACK_MASKS) / sizeof(uint64_t);

-const uint8_t duckdb::ParquetDecodeUtils::BITPACK_DLEN = 8;
+const uint8_t facebook::velox::parquet::ParquetDecodeUtils::BITPACK_DLEN = 8;

 // Array of bit packed representations of randomInts_u32. The array at index i
// is packed i bits wide and the values come from the low bits of
@@ -316,11 +317,11 @@ void arrowBitUnpack(uint8_t bitWidth, T* result) {

 template <typename T>
 void duckdbBitUnpack(uint8_t bitWidth, T* result) {
-  duckdb::ByteBuffer duckInputBuffer(
+  facebook::velox::parquet::ByteBuffer duckInputBuffer(
       reinterpret_cast<char*>(bitPackedData[bitWidth].data()),
       BYTES(kNumValues, bitWidth));
   uint8_t bitpack_pos = 0;
-  duckdb::ParquetDecodeUtils::BitUnpack<T>(
+  facebook::velox::parquet::ParquetDecodeUtils::BitUnpack<T>(
       duckInputBuffer, bitpack_pos, result, kNumValues, bitWidth);
 }

diff --git a/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp b/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp
index 4775303057cbb..492b2517712be 100644
--- a/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp
+++ b/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp
@@ -95,6 +95,72 @@ TEST(OnDemandUnitLoaderTests, LoadsCorrectlyWithNoCallback) {
   EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, true}));
 }

+TEST(OnDemandUnitLoaderTests, CanSeek) {
+  size_t blockedOnIoCount = 0;
+  OnDemandUnitLoaderFactory factory([&](auto) { ++blockedOnIoCount; });
+  ReaderMock readerMock{{10, 20, 30}, {0, 0, 0}, factory};
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, false}));
+  EXPECT_EQ(blockedOnIoCount, 0);
+
+  EXPECT_NO_THROW(readerMock.seek(10););
+
+  EXPECT_TRUE(readerMock.read(3)); // Unit: 1, rows: 0-2, load(1)
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, true, false}));
+  EXPECT_EQ(blockedOnIoCount, 1);
+
+  EXPECT_NO_THROW(readerMock.seek(0););
+
+  EXPECT_TRUE(readerMock.read(3)); // Unit: 0, rows: 0-2, load(0), unload(1)
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false}));
+  EXPECT_EQ(blockedOnIoCount, 2);
+
+  EXPECT_NO_THROW(readerMock.seek(30););
+
+  EXPECT_TRUE(readerMock.read(3)); // Unit: 2, rows: 0-2, load(2), unload(0)
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, true}));
+  EXPECT_EQ(blockedOnIoCount, 3);
+
+  EXPECT_NO_THROW(readerMock.seek(5););
+
+  EXPECT_TRUE(readerMock.read(5)); // Unit: 0, rows: 5-9, load(0), unload(1)
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false}));
+  EXPECT_EQ(blockedOnIoCount, 4);
+}
+
+TEST(OnDemandUnitLoaderTests, SeekOutOfRangeReaderError) {
+  size_t blockedOnIoCount = 0;
+  OnDemandUnitLoaderFactory factory([&](auto) { ++blockedOnIoCount; });
+  ReaderMock readerMock{{10, 20, 30}, {0, 0, 0}, factory};
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, false}));
+  EXPECT_EQ(blockedOnIoCount, 0);
+  readerMock.seek(59);
+
+  readerMock.seek(60);
+
+  EXPECT_THAT(
+      [&]() { readerMock.seek(61); },
+      Throws(Property(
+          &facebook::velox::VeloxRuntimeError::message,
+          HasSubstr("Can't seek to position 61 in file. Must be up to 60."))));
+}
+
+TEST(OnDemandUnitLoaderTests, SeekOutOfRange) {
+  OnDemandUnitLoaderFactory factory(nullptr);
+  std::vector<std::atomic_bool> unitsLoaded(getUnitsLoadedWithFalse(1));
+  std::vector<std::unique_ptr<LoadUnit>> units;
+  units.push_back(std::make_unique<LoadUnitMock>(10, 0, unitsLoaded, 0));
+
+  auto unitLoader = factory.create(std::move(units));
+
+  unitLoader->onSeek(0, 10);
+
+  EXPECT_THAT(
+      [&]() { unitLoader->onSeek(0, 11); },
+      Throws(Property(
+          &facebook::velox::VeloxRuntimeError::message,
+          HasSubstr("Row out of range"))));
+}
+
 TEST(OnDemandUnitLoaderTests, UnitOutOfRange) {
   OnDemandUnitLoaderFactory factory(nullptr);
   std::vector<std::atomic_bool> unitsLoaded(getUnitsLoadedWithFalse(1));
diff --git a/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp b/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp
index 716327e301490..ed1e9417d48bf 100644
--- a/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp
+++ b/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp
@@ -42,6 +42,28 @@ bool ReaderMock::read(uint64_t maxRows) {
   return true;
 }

+void ReaderMock::seek(uint64_t rowNumber) {
+  uint64_t totalRows = 0;
+  uint64_t rowsLeft = rowNumber;
+  for (size_t unit = 0; unit < rowsPerUnit_.size(); ++unit) {
+    const uint64_t rowCount = rowsPerUnit_[unit];
+    if (rowsLeft < rowCount) {
+      currentUnit_ = unit;
+      currentRowInUnit_ = rowsLeft;
+      loader_->onSeek(currentUnit_, currentRowInUnit_);
+      return;
+    }
+    rowsLeft -= rowCount;
+    totalRows += rowCount;
+  }
+  VELOX_CHECK_EQ(
+      rowsLeft,
+      0,
+      "Can't seek to position {} in file. Must be up to {}.",
+      rowNumber,
+      totalRows);
+}
+
 bool ReaderMock::loadUnit() {
   VELOX_CHECK(currentRowInUnit_ <= rowsPerUnit_[currentUnit_]);
   if (currentRowInUnit_ == rowsPerUnit_[currentUnit_]) {
@@ -51,11 +73,9 @@ bool ReaderMock::loadUnit() {
       return false;
     }
   }
-  if (currentRowInUnit_ == 0) {
-    auto& unit = loader_->getLoadedUnit(currentUnit_);
-    auto& unitMock = dynamic_cast<LoadUnitMock&>(unit);
-    VELOX_CHECK(unitMock.isLoaded());
-  }
+  auto& unit = loader_->getLoadedUnit(currentUnit_);
+  auto& unitMock = dynamic_cast<LoadUnitMock&>(unit);
+  VELOX_CHECK(unitMock.isLoaded());
   return true;
 }
diff --git a/velox/dwio/common/tests/utils/UnitLoaderTestTools.h b/velox/dwio/common/tests/utils/UnitLoaderTestTools.h
index e7760e75fd493..f606d7db71f15 100644
--- a/velox/dwio/common/tests/utils/UnitLoaderTestTools.h
+++ b/velox/dwio/common/tests/utils/UnitLoaderTestTools.h
@@ -78,6 +78,8 @@ class ReaderMock {

   bool read(uint64_t maxRows);

+  void seek(uint64_t rowNumber);
+
   std::vector<bool> unitsLoaded() const {
     return {unitsLoaded_.begin(), unitsLoaded_.end()};
   }
diff --git a/velox/dwio/dwrf/reader/DwrfReader.cpp b/velox/dwio/dwrf/reader/DwrfReader.cpp
index 258c09bb7f1ba..5ca3c2b041df7 100644
--- a/velox/dwio/dwrf/reader/DwrfReader.cpp
+++ b/velox/dwio/dwrf/reader/DwrfReader.cpp
@@ -316,6 +316,7 @@ uint64_t DwrfRowReader::seekToRow(uint64_t rowNumber) {
   if (isEmptyFile()) {
     return 0;
   }
+  nextRowNumber_.reset();

   // If we are reading only a portion of the file
   // (bounded by firstStripe_ and stripeCeiling_),
@@ -357,6 +358,9 @@ uint64_t DwrfRowReader::seekToRow(uint64_t rowNumber) {
   currentRowInStripe_ = rowNumber - firstRowOfStripe_[currentStripe_];
   previousRow_ = rowNumber;

+  const auto loadUnitIdx = currentStripe_ - firstStripe_;
+  unitLoader_->onSeek(loadUnitIdx, currentRowInStripe_);
+
   if (currentStripe_ != previousStripe) {
     // Different stripe. Let's load the new stripe.
currentUnit_ = nullptr; @@ -583,6 +587,9 @@ void DwrfRowReader::readWithRowNumber( } int64_t DwrfRowReader::nextRowNumber() { + if (nextRowNumber_.has_value()) { + return *nextRowNumber_; + } auto strideSize = getReader().getFooter().rowIndexStride(); while (currentStripe_ < stripeCeiling_) { if (currentRowInStripe_ == 0) { @@ -601,20 +608,21 @@ int64_t DwrfRowReader::nextRowNumber() { } checkSkipStrides(strideSize); if (currentRowInStripe_ < rowsInCurrentStripe_) { - return firstRowOfStripe_[currentStripe_] + currentRowInStripe_; + nextRowNumber_ = firstRowOfStripe_[currentStripe_] + currentRowInStripe_; + return *nextRowNumber_; } advanceToNextStripe: ++currentStripe_; currentRowInStripe_ = 0; currentUnit_ = nullptr; } - atEnd_ = true; + nextRowNumber_ = kAtEnd; return kAtEnd; } int64_t DwrfRowReader::nextReadSize(uint64_t size) { VELOX_DCHECK_GT(size, 0); - if (atEnd_) { + if (nextRowNumber() == kAtEnd) { return kAtEnd; } auto rowsToRead = std::min(size, rowsInCurrentStripe_ - currentRowInStripe_); @@ -643,6 +651,7 @@ uint64_t DwrfRowReader::next( return 0; } auto rowsToRead = nextReadSize(size); + nextRowNumber_.reset(); previousRow_ = nextRow; // Record strideIndex for use by the columnReader_ which may delay actual // reading of the data. diff --git a/velox/dwio/dwrf/reader/DwrfReader.h b/velox/dwio/dwrf/reader/DwrfReader.h index acd496e414de2..549746776fa16 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.h +++ b/velox/dwio/dwrf/reader/DwrfReader.h @@ -160,7 +160,7 @@ class DwrfRowReader : public StrideIndexProvider, dwio::common::ColumnReaderStatistics columnReaderStatistics_; - bool atEnd_{false}; + std::optional nextRowNumber_; std::unique_ptr unitLoader_; DwrfUnit* currentUnit_; diff --git a/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h b/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h index 363082eccd64b..8684842c6fa27 100644 --- a/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h @@ -79,7 +79,7 @@ class SelectiveByteRleColumnReader void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) override { - readCommon(offset, rows, incomingNulls); + readCommon(offset, rows, incomingNulls); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h b/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h index 5498ea77e8c52..6cce3ad3ec75f 100644 --- a/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h @@ -49,7 +49,7 @@ class SelectiveFloatingPointColumnReader void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) override { using T = SelectiveFloatingPointColumnReader; - this->template readCommon(offset, rows, incomingNulls); + this->template readCommon(offset, rows, incomingNulls); this->readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp index 649319a6f9dac..eaf99cafb9374 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp @@ -105,7 +105,7 @@ void SelectiveIntegerDictionaryColumnReader::read( // lazy load dictionary only when it's needed ensureInitialized(); - readCommon(rows); + readCommon(rows); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp 
b/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp index 57dc53090953f..c7bd41bda3bea 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp @@ -34,7 +34,7 @@ void SelectiveIntegerDirectColumnReader::read( offset, rows, incomingNulls); - readCommon(rows); + readCommon(rows); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp index 76f17361e6a6f..adc775d2d1018 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp @@ -422,9 +422,7 @@ void SelectiveStringDirectColumnReader::readWithVisitor( int32_t current = visitor.start(); constexpr bool isExtract = std::is_same_v && - std::is_same_v< - typename TVisitor::Extract, - dwio::common::ExtractToReader>; + std::is_same_v; auto nulls = nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; if (process::hasAvx2() && isExtract) { @@ -465,73 +463,11 @@ void SelectiveStringDirectColumnReader::readWithVisitor( } } -template -void SelectiveStringDirectColumnReader::readHelper( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { - readWithVisitor( - rows, - dwio::common:: - ColumnVisitor( - *reinterpret_cast(filter), this, rows, extractValues)); -} - -template -void SelectiveStringDirectColumnReader::processFilter( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { - if (filter == nullptr) { - readHelper( - &dwio::common::alwaysTrue(), rows, extractValues); - return; - } - - switch (filter->kind()) { - case common::FilterKind::kAlwaysTrue: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kIsNull: - filterNulls( - rows, - true, - !std::is_same_v); - break; - case common::FilterKind::kIsNotNull: - if (std::is_same_v) { - filterNulls(rows, false, false); - } else { - readHelper(filter, rows, extractValues); - } - break; - case common::FilterKind::kBytesRange: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kNegatedBytesRange: - readHelper( - filter, rows, extractValues); - break; - case common::FilterKind::kBytesValues: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kNegatedBytesValues: - readHelper( - filter, rows, extractValues); - break; - default: - readHelper(filter, rows, extractValues); - break; - } -} - void SelectiveStringDirectColumnReader::read( vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); - bool isDense = rows.back() == rows.size() - 1; - auto numRows = rows.back() + 1; auto numNulls = nullsInReadRange_ ? 
BaseVector::countNulls(nullsInReadRange_, 0, numRows) @@ -542,38 +478,8 @@ void SelectiveStringDirectColumnReader::read( lengths_->asMutable(), numRows - numNulls); rawLengths_ = lengths_->as(); lengthIndex_ = 0; - if (scanSpec_->keepValues()) { - if (scanSpec_->valueHook()) { - if (isDense) { - readHelper( - &dwio::common::alwaysTrue(), - rows, - dwio::common::ExtractToGenericHook(scanSpec_->valueHook())); - } else { - readHelper( - &dwio::common::alwaysTrue(), - rows, - dwio::common::ExtractToGenericHook(scanSpec_->valueHook())); - } - } else { - if (isDense) { - processFilter( - scanSpec_->filter(), rows, dwio::common::ExtractToReader(this)); - } else { - processFilter( - scanSpec_->filter(), rows, dwio::common::ExtractToReader(this)); - } - } - } else { - if (isDense) { - processFilter( - scanSpec_->filter(), rows, dwio::common::DropValues()); - } else { - processFilter( - scanSpec_->filter(), rows, dwio::common::DropValues()); - } - } - + dwio::common::StringColumnReadWithVisitorHelper( + *this, rows)([&](auto visitor) { readWithVisitor(rows, visitor); }); readOffset_ += numRows; } diff --git a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h index 21fe4a3a25e53..cfa8a7350136b 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h @@ -66,15 +66,6 @@ class SelectiveStringDirectColumnReader template void readWithVisitor(RowSet rows, TVisitor visitor); - template - void readHelper(common::Filter* filter, RowSet rows, ExtractValues values); - - template - void processFilter( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues); - void extractCrossBuffers( const int32_t* lengths, const int32_t* starts, diff --git a/velox/dwio/dwrf/test/CacheInputTest.cpp b/velox/dwio/dwrf/test/CacheInputTest.cpp index 7830de27502d1..dd15964972e2c 100644 --- a/velox/dwio/dwrf/test/CacheInputTest.cpp +++ b/velox/dwio/dwrf/test/CacheInputTest.cpp @@ -201,8 +201,15 @@ class CacheTest : public testing::Test { (1 << 20) - 11, (streamStarts_[streamIndex + 1] - streamStarts_[streamIndex]) / 2)}; - data->streams.push_back( - data->input->enqueue(region, streamIds_[streamIndex].get())); + auto stream = data->input->enqueue(region, streamIds_[streamIndex].get()); + if (cache_->ssdCache()) { + auto name = static_cast(*stream).getName(); + EXPECT_TRUE( + name.find("ssdFile=" + cache_->ssdCache()->filePrefix()) != + name.npos) + << name; + } + data->streams.push_back(std::move(stream)); data->regions.push_back(region); } return data; @@ -424,6 +431,7 @@ TEST_F(CacheTest, window) { auto stream = input->read(begin, end - begin, LogType::TEST); auto cacheInput = dynamic_cast(stream.get()); EXPECT_TRUE(cacheInput != nullptr); + ASSERT_EQ(cacheInput->getName(), "CacheInputStream 0 of 13631488"); auto maxSize = allocator_->sizeClasses().back() * memory::AllocationTraits::kPageSize; const void* buffer; @@ -501,8 +509,6 @@ TEST_F(CacheTest, ssd) { readFiles( "prefix1_", 0, kSsdBytes / bytesPerFile, 30, 100, 1, kStripesPerFile, 4); - LOG(INFO) << cache_->toString(); - waitForWrite(); cache_->clear(); // Read double this to get some eviction from SSD. @@ -523,7 +529,6 @@ TEST_F(CacheTest, ssd) { // issued. Also, the head of each file does not get prefetched // because each file has its own tracker. 
EXPECT_LE(kSsdBytes / 8, ioStats_->prefetch().sum()); - LOG(INFO) << cache_->toString(); readFiles( "prefix1_", @@ -534,7 +539,6 @@ TEST_F(CacheTest, ssd) { 1, kStripesPerFile, 4); - LOG(INFO) << cache_->toString(); } TEST_F(CacheTest, singleFileThreads) { diff --git a/velox/dwio/parquet/reader/BooleanColumnReader.h b/velox/dwio/parquet/reader/BooleanColumnReader.h index 41d3405abd548..73126f4679888 100644 --- a/velox/dwio/parquet/reader/BooleanColumnReader.h +++ b/velox/dwio/parquet/reader/BooleanColumnReader.h @@ -49,7 +49,7 @@ class BooleanColumnReader : public dwio::common::SelectiveByteRleColumnReader { void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) override { - readCommon(offset, rows, incomingNulls); + readCommon(offset, rows, incomingNulls); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/parquet/reader/FloatingPointColumnReader.h b/velox/dwio/parquet/reader/FloatingPointColumnReader.h index be4c2cd843631..ed91e67a739ff 100644 --- a/velox/dwio/parquet/reader/FloatingPointColumnReader.h +++ b/velox/dwio/parquet/reader/FloatingPointColumnReader.h @@ -48,7 +48,7 @@ class FloatingPointColumnReader void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) override { using T = FloatingPointColumnReader; - this->template readCommon(offset, rows, incomingNulls); + this->template readCommon(offset, rows, incomingNulls); this->readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/parquet/reader/IntegerColumnReader.h b/velox/dwio/parquet/reader/IntegerColumnReader.h index 59a9fc12bf919..d7b458c739534 100644 --- a/velox/dwio/parquet/reader/IntegerColumnReader.h +++ b/velox/dwio/parquet/reader/IntegerColumnReader.h @@ -75,7 +75,7 @@ class IntegerColumnReader : public dwio::common::SelectiveIntegerColumnReader { offset, rows, nullptr); - readCommon(rows); + readCommon(rows); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/parquet/reader/StringColumnReader.cpp b/velox/dwio/parquet/reader/StringColumnReader.cpp index 334c3c02a7e20..2dd0250159c3e 100644 --- a/velox/dwio/parquet/reader/StringColumnReader.cpp +++ b/velox/dwio/parquet/reader/StringColumnReader.cpp @@ -31,104 +31,15 @@ uint64_t StringColumnReader::skip(uint64_t numValues) { return numValues; } -template -void StringColumnReader::readHelper( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { - formatData_->as().readWithVisitor( - dwio::common:: - ColumnVisitor( - *reinterpret_cast(filter), this, rows, extractValues)); -} - -template -void StringColumnReader::processFilter( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { - if (filter == nullptr) { - readHelper( - &dwio::common::alwaysTrue(), rows, extractValues); - return; - } - - switch (filter->kind()) { - case common::FilterKind::kAlwaysTrue: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kIsNull: - filterNulls( - rows, - true, - !std::is_same:: - value); - break; - case common::FilterKind::kIsNotNull: - if (std::is_same:: - value) { - filterNulls(rows, false, false); - } else { - readHelper(filter, rows, extractValues); - } - break; - case common::FilterKind::kBytesRange: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kNegatedBytesRange: - readHelper( - filter, rows, extractValues); - break; - case common::FilterKind::kBytesValues: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kNegatedBytesValues: - readHelper( - filter, rows, extractValues); - break; - default: - 
readHelper(filter, rows, extractValues); - break; - } -} - void StringColumnReader::read( vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); - bool isDense = rows.back() == rows.size() - 1; - if (scanSpec_->keepValues()) { - if (scanSpec_->valueHook()) { - if (isDense) { - readHelper( - &dwio::common::alwaysTrue(), - rows, - dwio::common::ExtractToGenericHook(scanSpec_->valueHook())); - } else { - readHelper( - &dwio::common::alwaysTrue(), - rows, - dwio::common::ExtractToGenericHook(scanSpec_->valueHook())); - } - return; - } - if (isDense) { - processFilter( - scanSpec_->filter(), rows, dwio::common::ExtractToReader(this)); - } else { - processFilter( - scanSpec_->filter(), rows, dwio::common::ExtractToReader(this)); - } - } else { - if (isDense) { - processFilter( - scanSpec_->filter(), rows, dwio::common::DropValues()); - } else { - processFilter( - scanSpec_->filter(), rows, dwio::common::DropValues()); - } - } + dwio::common::StringColumnReadWithVisitorHelper( + *this, rows)([&](auto visitor) { + formatData_->as().readWithVisitor(visitor); + }); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/parquet/reader/StringColumnReader.h b/velox/dwio/parquet/reader/StringColumnReader.h index 23269fda84462..e9bedc2365fc8 100644 --- a/velox/dwio/parquet/reader/StringColumnReader.h +++ b/velox/dwio/parquet/reader/StringColumnReader.h @@ -49,27 +49,6 @@ class StringColumnReader : public dwio::common::SelectiveColumnReader { void getValues(RowSet rows, VectorPtr* result) override; void dedictionarize() override; - - private: - template - void skipInDecode(int32_t numValues, int32_t current, const uint64_t* nulls); - - folly::StringPiece readValue(int32_t length); - - template - void decode(const uint64_t* nulls, Visitor visitor); - - template - void readWithVisitor(RowSet rows, TVisitor visitor); - - template - void readHelper(common::Filter* filter, RowSet rows, ExtractValues values); - - template - void processFilter( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues); }; } // namespace facebook::velox::parquet diff --git a/velox/exec/Driver.cpp b/velox/exec/Driver.cpp index f6e466cb5a5ab..fa732e12cba09 100644 --- a/velox/exec/Driver.cpp +++ b/velox/exec/Driver.cpp @@ -110,6 +110,19 @@ inline void checkIsBlockFutureValid( op->operatorType()); } +// Used to generate context for exceptions that are thrown while executing an +// operator. 
Eg output: 'Operator: FilterProject(1) PlanNodeId: 1 TaskId: +// test_cursor 1 PipelineId: 0 DriverId: 0 OperatorAddress: 0x61a000003c80' +std::string addContextOnException( + VeloxException::Type exceptionType, + void* arg) { + if (exceptionType != VeloxException::Type::kSystem) { + return ""; + } + auto* op = static_cast(arg); + return fmt::format("Operator: {}", op->toString()); +} + } // namespace DriverCtx::DriverCtx( @@ -293,11 +306,12 @@ void Driver::initializeOperators() { } void Driver::pushdownFilters(int operatorIndex) { - auto op = operators_[operatorIndex].get(); + auto* op = operators_[operatorIndex].get(); const auto& filters = op->getDynamicFilters(); if (filters.empty()) { return; } + const auto& planNodeId = op->planNodeId(); op->addRuntimeStat("dynamicFiltersProduced", RuntimeCounter(filters.size())); @@ -313,7 +327,7 @@ void Driver::pushdownFilters(int operatorIndex) { prevOp->canAddDynamicFilter(), "Cannot push down dynamic filters produced by {}", op->toString()); - prevOp->addDynamicFilter(channel, entry.second); + prevOp->addDynamicFilter(planNodeId, channel, entry.second); prevOp->addRuntimeStat("dynamicFiltersAccepted", RuntimeCounter(1)); break; } @@ -327,7 +341,7 @@ void Driver::pushdownFilters(int operatorIndex) { prevOp->canAddDynamicFilter(), "Cannot push down dynamic filters produced by {}", op->toString()); - prevOp->addDynamicFilter(channel, entry.second); + prevOp->addDynamicFilter(planNodeId, channel, entry.second); prevOp->addRuntimeStat("dynamicFiltersAccepted", RuntimeCounter(1)); break; } @@ -374,10 +388,12 @@ void Driver::enqueueInternal() { RuntimeStatWriterScopeGuard statsWriterGuard(operatorPtr); \ threadNumVeloxThrow() = 0; \ opCallStatus_.start(operatorId, operatorMethod); \ + ExceptionContextSetter exceptionContext( \ + {addContextOnException, operatorPtr, true}); \ auto stopGuard = folly::makeGuard([&]() { opCallStatus_.stop(); }); \ call; \ recordSilentThrows(*operatorPtr); \ - } catch (const VeloxException& e) { \ + } catch (const VeloxException&) { \ throw; \ } catch (const std::exception& e) { \ VELOX_FAIL( \ diff --git a/velox/exec/ExchangeClient.cpp b/velox/exec/ExchangeClient.cpp index 0ca18e1fcf700..65eab7842e675 100644 --- a/velox/exec/ExchangeClient.cpp +++ b/velox/exec/ExchangeClient.cpp @@ -117,6 +117,11 @@ ExchangeClient::next(uint32_t maxBytes, bool* atEnd, ContinueFuture* future) { std::vector> pages; { std::lock_guard l(queue_->mutex()); + if (closed_) { + *atEnd = true; + return pages; + } + *atEnd = false; pages = queue_->dequeueLocked(maxBytes, atEnd, future); if (*atEnd) { diff --git a/velox/exec/HashProbe.cpp b/velox/exec/HashProbe.cpp index 318ff7b1df2da..3ff7e6c5d2e53 100644 --- a/velox/exec/HashProbe.cpp +++ b/velox/exec/HashProbe.cpp @@ -478,6 +478,7 @@ void HashProbe::prepareInputIndicesBuffers( VELOX_DCHECK(spillEnabled()); const auto maxIndicesBufferBytes = numInput * sizeof(vector_size_t); if (nonSpillInputIndicesBuffer_ == nullptr || + !nonSpillInputIndicesBuffer_->isMutable() || nonSpillInputIndicesBuffer_->size() < maxIndicesBufferBytes) { nonSpillInputIndicesBuffer_ = allocateIndices(numInput, pool()); rawNonSpillInputIndicesBuffer_ = @@ -1052,7 +1053,7 @@ bool HashProbe::maybeReadSpillOutput() { return true; } -void HashProbe::fillFilterInput(vector_size_t size) { +RowVectorPtr HashProbe::createFilterInput(vector_size_t size) { std::vector filterColumns(filterInputType_->size()); for (auto projection : filterInputProjections_) { ensureLoadedIfNotAtEnd(projection.inputChannel); @@ -1068,11 +1069,12 @@ void 
HashProbe::fillFilterInput(vector_size_t size) { filterInputType_->children(), filterColumns); - filterInput_ = std::make_shared( + return std::make_shared( pool(), filterInputType_, nullptr, size, std::move(filterColumns)); } void HashProbe::prepareFilterRowsForNullAwareJoin( + RowVectorPtr& filterInput, vector_size_t numRows, bool filterPropagateNulls) { VELOX_CHECK_LE(numRows, kBatchSize); @@ -1086,7 +1088,7 @@ void HashProbe::prepareFilterRowsForNullAwareJoin( auto* rawNullRows = nullFilterInputRows_.asMutableRange().bits(); for (auto& projection : filterInputProjections_) { filterInputColumnDecodedVector_.decode( - *filterInput_->childAt(projection.outputChannel), filterInputRows_); + *filterInput->childAt(projection.outputChannel), filterInputRows_); if (filterInputColumnDecodedVector_.mayHaveNulls()) { SelectivityVector nullsInActiveRows(numRows); memcpy( @@ -1285,13 +1287,14 @@ int32_t HashProbe::evalFilter(int32_t numRows) { filterInputRows_.updateBounds(); } - fillFilterInput(numRows); + RowVectorPtr filterInput = createFilterInput(numRows); if (nullAware_) { - prepareFilterRowsForNullAwareJoin(numRows, filterPropagateNulls); + prepareFilterRowsForNullAwareJoin( + filterInput, numRows, filterPropagateNulls); } - EvalCtx evalCtx(operatorCtx_->execCtx(), filter_.get(), filterInput_.get()); + EvalCtx evalCtx(operatorCtx_->execCtx(), filter_.get(), filterInput.get()); filter_->eval(0, 1, true, filterInputRows_, evalCtx, filterResult_); decodedFilterResult_.decode(*filterResult_[0], filterInputRows_); @@ -1668,7 +1671,7 @@ void HashProbe::spillOutput(const std::vector& operators) { // this runs. try { spillTask->move(); - } catch (const std::exception& e) { + } catch (const std::exception&) { } } }); @@ -1770,7 +1773,7 @@ SpillPartitionSet HashProbe::spillTable() { // this runs. try { spillTask->move(); - } catch (const std::exception& e) { + } catch (const std::exception&) { } } }); diff --git a/velox/exec/HashProbe.h b/velox/exec/HashProbe.h index 997847fbb057d..3b9af4fd3d8e8 100644 --- a/velox/exec/HashProbe.h +++ b/velox/exec/HashProbe.h @@ -128,14 +128,17 @@ class HashProbe : public Operator { decodedFilterResult_.valueAt(row); } - // Populate filter input columns. - void fillFilterInput(vector_size_t size); + // Create a temporary input vector to be passed to the filter. This ensures it + // gets destroyed in case it's wrapping an unloaded vector which eventually + // needs to be wrapped in fillOutput(). + RowVectorPtr createFilterInput(vector_size_t size); // Prepare filter row selectivity for null-aware join. 'numRows' // specifies the number of rows in 'filterInputRows_' to process. If // 'filterPropagateNulls' is true, the probe input row which has null in any // probe filter column can't pass the filter. void prepareFilterRowsForNullAwareJoin( + RowVectorPtr& filterInput, vector_size_t numRows, bool filterPropagateNulls); @@ -372,7 +375,7 @@ class HashProbe : public Operator { // side. Used by right semi project join. bool probeSideHasNullKeys_{false}; - // Rows in 'filterInput_' to apply 'filter_' to. + // Rows in the filter columns to apply 'filter_' to. SelectivityVector filterInputRows_; // Join filter. @@ -390,11 +393,6 @@ class HashProbe : public Operator { // Maps from column index in hash table to channel in 'filterInputType_'. std::vector filterTableProjections_; - // Temporary projection from probe and build for evaluating - // 'filter_'. This can always be reused since this does not escape - // this operator. 
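The motivation for the createFilterInput() change in this hunk, as the new header comment says, is lifetime: the filter input may wrap lazy (unloaded) vectors, and keeping it in the filterInput_ member pinned them until the next batch. Returning a local RowVectorPtr scopes them to one evalFilter() call. A minimal sketch of the pattern, with simplified, hypothetical names; the constructor arguments mirror the hunk above:

#include <memory>
#include <vector>
#include "velox/vector/ComplexVector.h"

using namespace facebook::velox;

RowVectorPtr makeTemporaryFilterInput(
    memory::MemoryPool* pool,
    const RowTypePtr& type,
    vector_size_t size,
    std::vector<VectorPtr> children) {
  // Returned by value: once the caller's local goes out of scope, any lazy
  // children wrapped here can be released promptly instead of lingering in
  // an operator member until the next batch overwrites it.
  return std::make_shared<RowVector>(
      pool, type, /*nulls=*/nullptr, size, std::move(children));
}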
- RowVectorPtr filterInput_; - // The following six fields are used in null-aware anti join filter // processing. diff --git a/velox/exec/Operator.cpp b/velox/exec/Operator.cpp index 45c83098b561c..18483e565414a 100644 --- a/velox/exec/Operator.cpp +++ b/velox/exec/Operator.cpp @@ -395,9 +395,10 @@ std::string Operator::toString() const { std::stringstream out; if (auto task = operatorCtx_->task()) { auto driverCtx = operatorCtx_->driverCtx(); - out << operatorType() << "(" << operatorId() << ")<" << task->taskId() - << ":" << driverCtx->pipelineId << "." << driverCtx->driverId << " " - << this; + out << operatorType() << "(" << operatorId() << ")" + << " PlanNodeId: " << planNodeId() << " TaskId: " << task->taskId() + << " PipelineId: " << driverCtx->pipelineId + << " DriverId: " << driverCtx->driverId << " OperatorAddress: " << this; } else { out << ""; } @@ -504,6 +505,8 @@ void OperatorStats::add(const OperatorStats& other) { spilledFiles += other.spilledFiles; numNullKeys += other.numNullKeys; + + dynamicFilterStats.add(other.dynamicFilterStats); } void OperatorStats::clear() { @@ -537,6 +540,8 @@ void OperatorStats::clear() { spilledRows = 0; spilledPartitions = 0; spilledFiles = 0; + + dynamicFilterStats.clear(); } std::unique_ptr Operator::MemoryReclaimer::create( diff --git a/velox/exec/Operator.h b/velox/exec/Operator.h index c48862c60b63a..3519b2f2d6fbe 100644 --- a/velox/exec/Operator.h +++ b/velox/exec/Operator.h @@ -82,6 +82,26 @@ struct MemoryStats { } }; +/// Records the dynamic filter stats of an operator. +struct DynamicFilterStats { + /// The set of plan node ids that produce the dynamic filter added to an + /// operator. If it is empty, then there is no dynamic filter added. + std::unordered_set producerNodeIds; + + void clear() { + producerNodeIds.clear(); + } + + void add(const DynamicFilterStats& other) { + producerNodeIds.insert( + other.producerNodeIds.begin(), other.producerNodeIds.end()); + } + + bool empty() const { + return producerNodeIds.empty(); + } +}; + struct OperatorStats { /// Initial ordinal position in the operator's pipeline. int32_t operatorId = 0; @@ -106,6 +126,9 @@ struct OperatorStats { uint64_t inputBytes = 0; uint64_t inputPositions = 0; + /// Contains the dynamic filters stats if applied. + DynamicFilterStats dynamicFilterStats; + /// Number of input batches / vectors. Allows to compute an average batch /// size. uint64_t inputVectors = 0; @@ -161,7 +184,7 @@ struct OperatorStats { int numDrivers = 0; - OperatorStats() {} + OperatorStats() = default; OperatorStats( int32_t _operatorId, @@ -422,6 +445,7 @@ class Operator : public BaseRuntimeStatWriter { /// Adds a filter dynamically generated by a downstream operator. Called only /// if canAddFilter() returns true. 
virtual void addDynamicFilter( + const core::PlanNodeId& /*producer*/, column_index_t /*outputChannel*/, const std::shared_ptr& /*filter*/) { VELOX_UNSUPPORTED( diff --git a/velox/exec/PlanNodeStats.cpp b/velox/exec/PlanNodeStats.cpp index c536f31b00b32..caab8f7648728 100644 --- a/velox/exec/PlanNodeStats.cpp +++ b/velox/exec/PlanNodeStats.cpp @@ -39,6 +39,8 @@ void PlanNodeStats::addTotals(const OperatorStats& stats) { rawInputRows += stats.rawInputPositions; rawInputBytes += stats.rawInputBytes; + dynamicFilterStats.add(stats.dynamicFilterStats); + outputRows += stats.outputPositions; outputBytes += stats.outputBytes; outputVectors += stats.outputVectors; @@ -112,6 +114,11 @@ std::string PlanNodeStats::toString(bool includeInputStats) const { << succinctBytes(spilledBytes) << ", " << spilledFiles << " files)"; } + if (!dynamicFilterStats.empty()) { + out << ", DynamicFilter producer plan nodes: " + << folly::join(',', dynamicFilterStats.producerNodeIds); + } + return out.str(); } diff --git a/velox/exec/PlanNodeStats.h b/velox/exec/PlanNodeStats.h index a53a7fa3ea007..4d10b9d608759 100644 --- a/velox/exec/PlanNodeStats.h +++ b/velox/exec/PlanNodeStats.h @@ -62,6 +62,9 @@ struct PlanNodeStats { /// Sum of raw input bytes for all corresponding operators. uint64_t rawInputBytes{0}; + /// Contains the dynamic filters stats if applied. + DynamicFilterStats dynamicFilterStats; + /// Sum of output rows for all corresponding operators. When /// plan node corresponds to multiple operator types, operators of only one of /// these types report non-zero output rows. diff --git a/velox/exec/TableScan.cpp b/velox/exec/TableScan.cpp index 1b4674cf50018..f1c24d97e6c48 100644 --- a/velox/exec/TableScan.cpp +++ b/velox/exec/TableScan.cpp @@ -356,12 +356,14 @@ bool TableScan::isFinished() { } void TableScan::addDynamicFilter( + const core::PlanNodeId& producer, column_index_t outputChannel, const std::shared_ptr& filter) { if (dataSource_) { dataSource_->addDynamicFilter(outputChannel, filter); } dynamicFilters_.emplace(outputChannel, filter); + stats_.wlock()->dynamicFilterStats.producerNodeIds.emplace(producer); } } // namespace facebook::velox::exec diff --git a/velox/exec/TableScan.h b/velox/exec/TableScan.h index 973821a3a45c8..516e377516ba3 100644 --- a/velox/exec/TableScan.h +++ b/velox/exec/TableScan.h @@ -46,6 +46,7 @@ class TableScan : public SourceOperator { } void addDynamicFilter( + const core::PlanNodeId& producer, column_index_t outputChannel, const std::shared_ptr& filter) override; diff --git a/velox/exec/Task.cpp b/velox/exec/Task.cpp index c0ac2f53283ce..01fdd52266bff 100644 --- a/velox/exec/Task.cpp +++ b/velox/exec/Task.cpp @@ -259,6 +259,7 @@ std::shared_ptr Task::create( std::move(onError)); } +// static std::shared_ptr Task::create( const std::string& taskId, core::PlanFragment planFragment, @@ -279,42 +280,6 @@ std::shared_ptr Task::create( return task; } -std::shared_ptr Task::create( - const std::string& taskId, - core::PlanFragment planFragment, - int destination, - std::shared_ptr queryCtx, - Consumer consumer, - std::function onError) { - return Task::create( - taskId, - std::move(planFragment), - destination, - std::move(queryCtx), - (consumer ? 
[c = std::move(consumer)]() { return c; } - : ConsumerSupplier{}), - std::move(onError)); -} - -std::shared_ptr Task::create( - const std::string& taskId, - core::PlanFragment planFragment, - int destination, - std::shared_ptr queryCtx, - ConsumerSupplier consumerSupplier, - std::function onError) { - auto task = std::shared_ptr(new Task( - taskId, - std::move(planFragment), - destination, - std::move(queryCtx), - Task::ExecutionMode::kParallel, - std::move(consumerSupplier), - std::move(onError))); - task->initTaskPool(); - return task; -} - Task::Task( const std::string& taskId, core::PlanFragment planFragment, diff --git a/velox/exec/Task.h b/velox/exec/Task.h index 09716455d18a9..3945103a0f326 100644 --- a/velox/exec/Task.h +++ b/velox/exec/Task.h @@ -84,24 +84,6 @@ class Task : public std::enable_shared_from_this { ConsumerSupplier consumerSupplier, std::function onError = nullptr); - /// TODO: Delete following two overloads once all callers are migrated to the - /// above ones - static std::shared_ptr create( - const std::string& taskId, - core::PlanFragment planFragment, - int destination, - std::shared_ptr queryCtx, - Consumer consumer = nullptr, - std::function onError = nullptr); - - static std::shared_ptr create( - const std::string& taskId, - core::PlanFragment planFragment, - int destination, - std::shared_ptr queryCtx, - ConsumerSupplier consumerSupplier, - std::function onError = nullptr); - ~Task(); /// Specify directory to which data will be spilled if spilling is enabled and diff --git a/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp b/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp index ce030bff359f4..0f2f9c0d76af0 100644 --- a/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp +++ b/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp @@ -197,6 +197,9 @@ struct HashTableBenchmarkResult { class HashTableListJoinResultBenchmark : public VectorTestBase { public: + HashTableListJoinResultBenchmark() + : randomEngine_((std::random_device{}())) {} + HashTableBenchmarkResult run(HashTableBenchmarkParams params) { params_ = params; HashTableBenchmarkResult result; @@ -260,7 +263,8 @@ class HashTableListJoinResultBenchmark : public VectorTestBase { if (addExtraValue) { data[0] = params_.extraValue; } - std::random_shuffle(data.begin(), data.end()); + + std::shuffle(data.begin(), data.end(), randomEngine_); std::vector children; children.push_back(makeFlatVector(data)); for (int32_t i = 0; i < params_.numDependentFields; ++i) { @@ -462,6 +466,7 @@ class HashTableListJoinResultBenchmark : public VectorTestBase { eraseTime_ += eraseClock.timeToDropValue(); } + std::default_random_engine randomEngine_; std::unique_ptr> topTable_; HashTableBenchmarkParams params_; diff --git a/velox/exec/fuzzer/AggregationFuzzer.cpp b/velox/exec/fuzzer/AggregationFuzzer.cpp index e6d15548e62c3..cde7098bc1250 100644 --- a/velox/exec/fuzzer/AggregationFuzzer.cpp +++ b/velox/exec/fuzzer/AggregationFuzzer.cpp @@ -25,7 +25,7 @@ #include "velox/exec/PartitionFunction.h" #include "velox/exec/fuzzer/AggregationFuzzerBase.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/vector/VectorSaver.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -39,8 +39,8 @@ DEFINE_bool( false, "When true, the results of the window aggregation are compared to reference DB results"); -using facebook::velox::test::CallableSignature; -using facebook::velox::test::SignatureTemplate; +using facebook::velox::fuzzer::CallableSignature; 
+using facebook::velox::fuzzer::SignatureTemplate; namespace facebook::velox::exec::test { @@ -153,7 +153,7 @@ class AggregationFuzzer : public AggregationFuzzerBase { const std::vector& plans, bool customVerification, const std::vector>& customVerifiers, - const velox::test::ResultOrError& expected, + const velox::fuzzer::ResultOrError& expected, int32_t maxDrivers = 2, bool testWithSpilling = true) { for (auto i = 0; i < plans.size(); ++i) { @@ -1074,7 +1074,7 @@ bool AggregationFuzzer::compareEquivalentPlanResults( stats_.updateReferenceQueryStats(referenceResult.second); if (referenceResult.first) { - velox::test::ResultOrError expected; + velox::fuzzer::ResultOrError expected; expected.result = mergeRowVectors(referenceResult.first.value(), pool_.get()); diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.cpp b/velox/exec/fuzzer/AggregationFuzzerBase.cpp index 462ecf9932ba8..6f6e42dd7bb69 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.cpp +++ b/velox/exec/fuzzer/AggregationFuzzerBase.cpp @@ -25,7 +25,7 @@ #include "velox/exec/fuzzer/PrestoQueryRunner.h" #include "velox/exec/tests/utils/TempDirectoryPath.h" #include "velox/expression/SignatureBinder.h" -#include "velox/expression/tests/utils/ArgumentTypeFuzzer.h" +#include "velox/expression/fuzzer/ArgumentTypeFuzzer.h" #include "velox/vector/VectorSaver.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -198,7 +198,7 @@ AggregationFuzzerBase::pickSignature() { const auto& signatureTemplate = signatureTemplates_[idx - signatures_.size()]; signature.name = signatureTemplate.name; - velox::test::ArgumentTypeFuzzer typeFuzzer( + velox::fuzzer::ArgumentTypeFuzzer typeFuzzer( *signatureTemplate.signature, rng_); VELOX_CHECK(typeFuzzer.fuzzArgumentTypes(FLAGS_max_num_varargs)); signature.args = typeFuzzer.argumentTypes(); @@ -386,7 +386,7 @@ void AggregationFuzzerBase::printSignatureStats() { } } -velox::test::ResultOrError AggregationFuzzerBase::execute( +velox::fuzzer::ResultOrError AggregationFuzzerBase::execute( const core::PlanNodePtr& plan, const std::vector& splits, bool injectSpill, @@ -395,7 +395,7 @@ velox::test::ResultOrError AggregationFuzzerBase::execute( LOG(INFO) << "Executing query plan: " << std::endl << plan->toString(true, true); - velox::test::ResultOrError resultOrError; + velox::fuzzer::ResultOrError resultOrError; try { std::shared_ptr spillDirectory; AssertQueryBuilder builder(plan); @@ -511,7 +511,7 @@ void AggregationFuzzerBase::testPlan( bool abandonPartial, bool customVerification, const std::vector>& customVerifiers, - const velox::test::ResultOrError& expected, + const velox::fuzzer::ResultOrError& expected, int32_t maxDrivers) { auto actual = execute( planWithSplits.plan, @@ -523,10 +523,10 @@ void AggregationFuzzerBase::testPlan( } void AggregationFuzzerBase::compare( - const velox::test::ResultOrError& actual, + const velox::fuzzer::ResultOrError& actual, bool customVerification, const std::vector>& customVerifiers, - const velox::test::ResultOrError& expected) { + const velox::fuzzer::ResultOrError& expected) { // Compare results or exceptions (if any). Fail if anything is different. if (FLAGS_enable_oom_injection) { // If OOM injection is enabled and we've made it this far and the test @@ -537,7 +537,8 @@ void AggregationFuzzerBase::compare( // Compare results or exceptions (if any). Fail if anything is different. if (expected.exceptionPtr || actual.exceptionPtr) { // Throws in case exceptions are not compatible. 
- velox::test::compareExceptions(expected.exceptionPtr, actual.exceptionPtr); + velox::fuzzer::compareExceptions( + expected.exceptionPtr, actual.exceptionPtr); return; } diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.h b/velox/exec/fuzzer/AggregationFuzzerBase.h index 9b9c0530dcaa1..f2f9b6fb632fd 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.h +++ b/velox/exec/fuzzer/AggregationFuzzerBase.h @@ -23,7 +23,7 @@ #include "velox/exec/fuzzer/ReferenceQueryRunner.h" #include "velox/exec/fuzzer/ResultVerifier.h" #include "velox/exec/tests/utils/AssertQueryBuilder.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/vector/fuzzer/VectorFuzzer.h" #include "velox/vector/tests/utils/VectorMaker.h" @@ -47,8 +47,8 @@ DECLARE_bool(log_signature_stats); namespace facebook::velox::exec::test { -using facebook::velox::test::CallableSignature; -using facebook::velox::test::SignatureTemplate; +using facebook::velox::fuzzer::CallableSignature; +using facebook::velox::fuzzer::SignatureTemplate; constexpr const std::string_view kPlanNodeFileName = "plan_nodes"; @@ -219,7 +219,7 @@ class AggregationFuzzerBase { const core::PlanNodePtr& plan, const std::vector& input); - velox::test::ResultOrError execute( + velox::fuzzer::ResultOrError execute( const core::PlanNodePtr& plan, const std::vector& splits = {}, bool injectSpill = false, @@ -236,10 +236,10 @@ class AggregationFuzzerBase { const std::vector& input); void compare( - const velox::test::ResultOrError& actual, + const velox::fuzzer::ResultOrError& actual, bool customVerification, const std::vector>& customVerifiers, - const velox::test::ResultOrError& expected); + const velox::fuzzer::ResultOrError& expected); /// Returns false if the type or its children are unsupported. /// Currently returns false if type is Date,IntervalDayTime or Unknown. 
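The velox::test to velox::fuzzer renames across these fuzzer files are a pure namespace move of the fuzzer toolkit (CallableSignature, SignatureTemplate, ResultOrError, compareExceptions, ArgumentTypeFuzzer); behavior is unchanged. The comparison contract stays as before; roughly, and assuming compareExceptions throws on incompatible exceptions while assertEqualResults returns true on equal result sets, a condensed, hypothetical restatement of compare() looks like:

// Sketch only; not the actual AggregationFuzzerBase::compare() body.
void compareOutcome(
    const facebook::velox::fuzzer::ResultOrError& expected,
    const facebook::velox::fuzzer::ResultOrError& actual) {
  if (expected.exceptionPtr || actual.exceptionPtr) {
    // Throws if the two exceptions are not compatible.
    facebook::velox::fuzzer::compareExceptions(
        expected.exceptionPtr, actual.exceptionPtr);
    return;
  }
  // Neither side threw: the result vectors must match row-for-row.
  VELOX_CHECK(facebook::velox::exec::test::assertEqualResults(
      {expected.result}, {actual.result}));
}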
@@ -258,7 +258,7 @@ class AggregationFuzzerBase { bool abandonPartial, bool customVerification, const std::vector>& customVerifiers, - const velox::test::ResultOrError& expected, + const velox::fuzzer::ResultOrError& expected, int32_t maxDrivers = 2); void printSignatureStats(); diff --git a/velox/exec/fuzzer/AggregationFuzzerRunner.h b/velox/exec/fuzzer/AggregationFuzzerRunner.h index 0697576551992..f82a2120189cb 100644 --- a/velox/exec/fuzzer/AggregationFuzzerRunner.h +++ b/velox/exec/fuzzer/AggregationFuzzerRunner.h @@ -94,7 +94,7 @@ class AggregationFuzzerRunner { exit(1); } - auto filteredSignatures = velox::test::filterSignatures( + auto filteredSignatures = velox::fuzzer::filterSignatures( signatures, options.onlyFunctions, options.skipFunctions); if (filteredSignatures.empty()) { LOG(ERROR) diff --git a/velox/exec/fuzzer/CMakeLists.txt b/velox/exec/fuzzer/CMakeLists.txt index 96ca34cb9cc4e..a169100a6bdf1 100644 --- a/velox/exec/fuzzer/CMakeLists.txt +++ b/velox/exec/fuzzer/CMakeLists.txt @@ -57,3 +57,9 @@ target_link_libraries( velox_expression_test_utility velox_aggregation_fuzzer_base velox_temp_path) + +add_library(velox_row_number_fuzzer RowNumberFuzzer.cpp) + +target_link_libraries( + velox_row_number_fuzzer velox_fuzzer_util velox_type velox_vector_fuzzer + velox_exec_test_lib velox_expression_test_utility) diff --git a/velox/exec/fuzzer/DuckQueryRunner.cpp b/velox/exec/fuzzer/DuckQueryRunner.cpp index d926addfd9211..e19b1d33a7b08 100644 --- a/velox/exec/fuzzer/DuckQueryRunner.cpp +++ b/velox/exec/fuzzer/DuckQueryRunner.cpp @@ -133,21 +133,26 @@ std::optional DuckQueryRunner::toSql( } } - if (auto projectNode = + if (const auto projectNode = std::dynamic_pointer_cast(plan)) { return toSql(projectNode); } - if (auto windowNode = + if (const auto windowNode = std::dynamic_pointer_cast(plan)) { return toSql(windowNode); } - if (auto aggregationNode = + if (const auto aggregationNode = std::dynamic_pointer_cast(plan)) { return toSql(aggregationNode); } + if (const auto rowNumberNode = + std::dynamic_pointer_cast(plan)) { + return toSql(rowNumberNode); + } + VELOX_NYI(); } @@ -297,4 +302,31 @@ std::optional DuckQueryRunner::toSql( return sql.str(); } + +std::optional DuckQueryRunner::toSql( + const std::shared_ptr& rowNumberNode) { + std::stringstream sql; + sql << "SELECT "; + + const auto& inputType = rowNumberNode->sources()[0]->outputType(); + for (auto i = 0; i < inputType->size(); ++i) { + appendComma(i, sql); + sql << inputType->nameOf(i); + } + + sql << ", row_number() OVER ("; + + const auto& partitionKeys = rowNumberNode->partitionKeys(); + if (!partitionKeys.empty()) { + sql << "partition by "; + for (auto i = 0; i < partitionKeys.size(); ++i) { + appendComma(i, sql); + sql << partitionKeys[i]->name(); + } + } + + sql << ") as row_number FROM tmp"; + + return sql.str(); +} } // namespace facebook::velox::exec::test diff --git a/velox/exec/fuzzer/DuckQueryRunner.h b/velox/exec/fuzzer/DuckQueryRunner.h index a683652946a4d..a5dc3f785716a 100644 --- a/velox/exec/fuzzer/DuckQueryRunner.h +++ b/velox/exec/fuzzer/DuckQueryRunner.h @@ -49,6 +49,9 @@ class DuckQueryRunner : public ReferenceQueryRunner { std::optional toSql( const std::shared_ptr& projectNode); + std::optional toSql( + const std::shared_ptr& rowNumberNode); + std::unordered_set aggregateFunctionNames_; }; diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp index bc81a452ae5e5..7831bf350a7a5 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.cpp +++ 
b/velox/exec/fuzzer/PrestoQueryRunner.cpp @@ -159,21 +159,26 @@ PrestoQueryRunner::PrestoQueryRunner( std::optional PrestoQueryRunner::toSql( const core::PlanNodePtr& plan) { - if (auto projectNode = + if (const auto projectNode = std::dynamic_pointer_cast(plan)) { return toSql(projectNode); } - if (auto windowNode = + if (const auto windowNode = std::dynamic_pointer_cast(plan)) { return toSql(windowNode); } - if (auto aggregationNode = + if (const auto aggregationNode = std::dynamic_pointer_cast(plan)) { return toSql(aggregationNode); } + if (const auto rowNumberNode = + std::dynamic_pointer_cast(plan)) { + return toSql(rowNumberNode); + } + VELOX_NYI(); } @@ -500,6 +505,37 @@ std::optional PrestoQueryRunner::toSql( return sql.str(); } +std::optional PrestoQueryRunner::toSql( + const std::shared_ptr& rowNumberNode) { + if (!isSupportedDwrfType(rowNumberNode->sources()[0]->outputType())) { + return std::nullopt; + } + + std::stringstream sql; + sql << "SELECT "; + + const auto& inputType = rowNumberNode->sources()[0]->outputType(); + for (auto i = 0; i < inputType->size(); ++i) { + appendComma(i, sql); + sql << inputType->nameOf(i); + } + + sql << ", row_number() OVER ("; + + const auto& partitionKeys = rowNumberNode->partitionKeys(); + if (!partitionKeys.empty()) { + sql << "partition by "; + for (auto i = 0; i < partitionKeys.size(); ++i) { + appendComma(i, sql); + sql << partitionKeys[i]->name(); + } + } + + sql << ") as row_number FROM tmp"; + + return sql.str(); +} + std::multiset> PrestoQueryRunner::execute( const std::string& sql, const std::vector& input, diff --git a/velox/exec/fuzzer/PrestoQueryRunner.h b/velox/exec/fuzzer/PrestoQueryRunner.h index dfa8fabea93f4..7490e91a03c25 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.h +++ b/velox/exec/fuzzer/PrestoQueryRunner.h @@ -86,6 +86,9 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner { std::optional toSql( const std::shared_ptr& projectNode); + std::optional toSql( + const std::shared_ptr& rowNumberNode); + std::string startQuery(const std::string& sql); std::string fetchNext(const std::string& nextUri); diff --git a/velox/exec/fuzzer/RowNumberFuzzer.cpp b/velox/exec/fuzzer/RowNumberFuzzer.cpp new file mode 100644 index 0000000000000..c7a482859c336 --- /dev/null +++ b/velox/exec/fuzzer/RowNumberFuzzer.cpp @@ -0,0 +1,550 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/exec/fuzzer/RowNumberFuzzer.h" +#include +#include +#include "velox/common/file/FileSystems.h" +#include "velox/connectors/hive/HiveConnector.h" +#include "velox/connectors/hive/HiveConnectorSplit.h" +#include "velox/dwio/dwrf/reader/DwrfReader.h" +#include "velox/dwio/dwrf/writer/Writer.h" +#include "velox/exec/fuzzer/ReferenceQueryRunner.h" +#include "velox/exec/tests/utils/AssertQueryBuilder.h" +#include "velox/exec/tests/utils/PlanBuilder.h" +#include "velox/exec/tests/utils/TempDirectoryPath.h" +#include "velox/vector/fuzzer/VectorFuzzer.h" + +DEFINE_int32(steps, 10, "Number of plans to generate and test."); + +DEFINE_int32( + duration_sec, + 0, + "For how long it should run (in seconds). If zero, " + "it executes exactly --steps iterations and exits."); + +DEFINE_int32( + batch_size, + 100, + "The number of elements on each generated vector."); + +DEFINE_int32(num_batches, 10, "The number of generated vectors."); + +DEFINE_double( + null_ratio, + 0.1, + "Chance of adding a null value in a vector " + "(expressed as double from 0 to 1)."); + +DEFINE_bool(enable_spill, true, "Whether to test plans with spilling enabled."); + +DEFINE_bool( + enable_oom_injection, + false, + "When enabled OOMs will randomly be triggered while executing query " + "plans. The goal of this mode is to ensure unexpected exceptions " + "aren't thrown and the process isn't killed in the process of cleaning " + "up after failures. Therefore, results are not compared when this is " + "enabled. Note that this option only works in debug builds."); + +namespace facebook::velox::exec::test { +namespace { + +class RowNumberFuzzer { + public: + explicit RowNumberFuzzer( + size_t initialSeed, + std::unique_ptr); + + void go(); + + struct PlanWithSplits { + core::PlanNodePtr plan; + std::vector> splits; + + explicit PlanWithSplits( + core::PlanNodePtr _plan, + const std::vector>& _splits = + {}) + : plan(std::move(_plan)), splits(_splits) {} + }; + + private: + static VectorFuzzer::Options getFuzzerOptions() { + VectorFuzzer::Options opts; + opts.vectorSize = FLAGS_batch_size; + opts.stringVariableLength = true; + opts.stringLength = 100; + opts.nullRatio = FLAGS_null_ratio; + return opts; + } + + static inline const std::string kHiveConnectorId = "test-hive"; + + // Makes a connector split from a file path on storage. + static std::shared_ptr makeSplit( + const std::string& filePath); + + void seed(size_t seed) { + currentSeed_ = seed; + vectorFuzzer_.reSeed(seed); + rng_.seed(currentSeed_); + } + + void reSeed() { + seed(rng_()); + } + + // Runs one test iteration: generates a query plan, executes it and + // verifies the results. + void verify(); + + int32_t randInt(int32_t min, int32_t max) { + return boost::random::uniform_int_distribution(min, max)(rng_); + } + + std::pair, std::vector> + generatePartitionKeys(); + + std::vector generateInput( + const std::vector& keyNames, + const std::vector& keyTypes); + + std::optional computeReferenceResults( + core::PlanNodePtr& plan, + const std::vector& input); + + RowVectorPtr execute(const PlanWithSplits& plan, bool injectSpill); + + void addPlansWithTableScan( + const std::string& tableDir, + const std::vector& partitionKeys, + const std::vector& input, + std::vector& altPlans); + + // Makes the query plan with default settings in RowNumberFuzzer, using + // 'input' as a values source. + // + // NOTE: 'input' could be either input rows with lazy + // vectors or flattened ones. 
+ static PlanWithSplits makeDefaultPlan( + const std::vector& partitionKeys, + const std::vector& input); + + static PlanWithSplits makePlanWithTableScan( + const RowTypePtr& type, + const std::vector& partitionKeys, + const std::vector>& splits); + + FuzzerGenerator rng_; + size_t currentSeed_{0}; + + std::shared_ptr rootPool_{ + memory::memoryManager()->addRootPool( + "rowNumberFuzzer", + memory::kMaxMemory, + memory::MemoryReclaimer::create())}; + std::shared_ptr pool_{rootPool_->addLeafChild( + "rowNumberFuzzerLeaf", + true, + exec::MemoryReclaimer::create())}; + std::shared_ptr writerPool_{rootPool_->addAggregateChild( + "rowNumberFuzzerWriter", + exec::MemoryReclaimer::create())}; + VectorFuzzer vectorFuzzer_; + std::unique_ptr referenceQueryRunner_; +}; + +RowNumberFuzzer::RowNumberFuzzer( + size_t initialSeed, + std::unique_ptr referenceQueryRunner) + : vectorFuzzer_{getFuzzerOptions(), pool_.get()}, + referenceQueryRunner_{std::move(referenceQueryRunner)} { + filesystems::registerLocalFileSystem(); + + // Make sure not to run out of open file descriptors. + const std::unordered_map hiveConfig = { + {connector::hive::HiveConfig::kNumCacheFileHandles, "1000"}}; + auto hiveConnector = + connector::getConnectorFactory( + connector::hive::HiveConnectorFactory::kHiveConnectorName) + ->newConnector( + kHiveConnectorId, std::make_shared(hiveConfig)); + connector::registerConnector(hiveConnector); + + seed(initialSeed); +} + +void writeToFile( + const std::string& path, + const VectorPtr& vector, + memory::MemoryPool* pool) { + dwrf::WriterOptions options; + options.schema = vector->type(); + options.memoryPool = pool; + auto writeFile = std::make_unique(path, true, false); + auto sink = + std::make_unique(std::move(writeFile), path); + dwrf::Writer writer(std::move(sink), options); + writer.write(vector); + writer.close(); +} + +// static +std::shared_ptr RowNumberFuzzer::makeSplit( + const std::string& filePath) { + return std::make_shared( + kHiveConnectorId, filePath, dwio::common::FileFormat::DWRF); +} + +template +bool isDone(size_t i, T startTime) { + if (FLAGS_duration_sec > 0) { + std::chrono::duration elapsed = + std::chrono::system_clock::now() - startTime; + return elapsed.count() >= FLAGS_duration_sec; + } + return i >= FLAGS_steps; +} + +std::vector flatten(const std::vector& vectors) { + std::vector flatVectors; + for (const auto& vector : vectors) { + auto flat = BaseVector::create( + vector->type(), vector->size(), vector->pool()); + flat->copy(vector.get(), 0, 0, vector->size()); + flatVectors.push_back(flat); + } + + return flatVectors; +} + +std::pair, std::vector> +RowNumberFuzzer::generatePartitionKeys() { + const auto numKeys = randInt(1, 3); + std::vector names; + std::vector types; + for (auto i = 0; i < numKeys; ++i) { + names.push_back(fmt::format("c{}", i)); + types.push_back(vectorFuzzer_.randType(/*maxDepth=*/1)); + } + return std::make_pair(names, types); +} + +std::vector RowNumberFuzzer::generateInput( + const std::vector& keyNames, + const std::vector& keyTypes) { + std::vector names = keyNames; + std::vector types = keyTypes; + // Add up to 3 payload columns. 
+ const auto numPayload = randInt(0, 3); + for (auto i = 0; i < numPayload; ++i) { + names.push_back(fmt::format("c{}", i + keyNames.size())); + types.push_back(vectorFuzzer_.randType(/*maxDepth=*/2)); + } + + const auto inputType = ROW(std::move(names), std::move(types)); + std::vector input; + input.reserve(FLAGS_num_batches); + for (auto i = 0; i < FLAGS_num_batches; ++i) { + input.push_back(vectorFuzzer_.fuzzInputRow(inputType)); + } + + return input; +} + +RowNumberFuzzer::PlanWithSplits RowNumberFuzzer::makeDefaultPlan( + const std::vector& partitionKeys, + const std::vector& input) { + auto planNodeIdGenerator = std::make_shared(); + std::vector projectFields = partitionKeys; + projectFields.emplace_back("row_number"); + auto plan = PlanBuilder() + .values(input) + .rowNumber(partitionKeys) + .project(projectFields) + .planNode(); + return PlanWithSplits{std::move(plan)}; +} + +bool containsType(const TypePtr& type, const TypePtr& search) { + if (type->equivalent(*search)) { + return true; + } + + for (auto i = 0; i < type->size(); ++i) { + if (containsType(type->childAt(i), search)) { + return true; + } + } + return false; +} + +bool containsTypeKind(const TypePtr& type, const TypeKind& search) { + if (type->kind() == search) { + return true; + } + + for (auto i = 0; i < type->size(); ++i) { + if (containsTypeKind(type->childAt(i), search)) { + return true; + } + } + + return false; +} + +bool containsUnsupportedTypes(const TypePtr& type) { + // Skip queries that use Timestamp, Varbinary, and IntervalDayTime types. + // DuckDB doesn't support nanosecond precision for timestamps or casting from + // Bigint to Interval. + // TODO Investigate mismatches reported when comparing Varbinary. + return containsTypeKind(type, TypeKind::TIMESTAMP) || + containsTypeKind(type, TypeKind::VARBINARY) || + containsType(type, INTERVAL_DAY_TIME()); +} + +std::optional RowNumberFuzzer::computeReferenceResults( + core::PlanNodePtr& plan, + const std::vector& input) { + if (containsUnsupportedTypes(input[0]->type())) { + return std::nullopt; + } + + if (auto sql = referenceQueryRunner_->toSql(plan)) { + return referenceQueryRunner_->execute( + sql.value(), input, plan->outputType()); + } + + LOG(INFO) << "Query not supported by the reference DB"; + return std::nullopt; +} + +RowVectorPtr RowNumberFuzzer::execute( + const PlanWithSplits& plan, + bool injectSpill) { + LOG(INFO) << "Executing query plan: " << plan.plan->toString(true, true); + + AssertQueryBuilder builder(plan.plan); + if (!plan.splits.empty()) { + builder.splits(plan.splits); + } + + std::shared_ptr spillDirectory; + int32_t spillPct{0}; + if (injectSpill) { + spillDirectory = exec::test::TempDirectoryPath::create(); + builder.config(core::QueryConfig::kSpillEnabled, true) + .config(core::QueryConfig::kRowNumberSpillEnabled, true) + .spillDirectory(spillDirectory->getPath()); + spillPct = 10; + } + + ScopedOOMInjector oomInjector( + []() -> bool { return folly::Random::oneIn(10); }, + 10); // Check the condition every 10 ms. + if (FLAGS_enable_oom_injection) { + oomInjector.enable(); + } + + // Wait for the task to be destroyed before starting the next query execution + // to avoid potential interference from background activities across query + // executions. 
+ auto stopGuard = folly::makeGuard([&]() { waitForAllTasksToBeDeleted(); }); + + TestScopedSpillInjection scopedSpillInjection(spillPct); + RowVectorPtr result; + try { + result = builder.copyResults(pool_.get()); + } catch (VeloxRuntimeError& e) { + if (FLAGS_enable_oom_injection && + e.errorCode() == facebook::velox::error_code::kMemCapExceeded && + e.message() == ScopedOOMInjector::kErrorMessage) { + // If we enabled OOM injection we expect the exception thrown by the + // ScopedOOMInjector. + return nullptr; + } + + // Rethrow to preserve the original exception type. + throw; + } + + if (VLOG_IS_ON(1)) { + VLOG(1) << std::endl << result->toString(0, result->size()); + } + + return result; +} + +RowNumberFuzzer::PlanWithSplits RowNumberFuzzer::makePlanWithTableScan( + const RowTypePtr& type, + const std::vector& partitionKeys, + const std::vector>& splits) { + std::vector projectFields = partitionKeys; + projectFields.emplace_back("row_number"); + + auto planNodeIdGenerator = std::make_shared(); + core::PlanNodeId scanId; + auto plan = PlanBuilder(planNodeIdGenerator) + .tableScan(type) + .rowNumber(partitionKeys) + .project(projectFields) + .planNode(); + return PlanWithSplits{plan, splits}; +} + +bool isTableScanSupported(const TypePtr& type) { + if (type->kind() == TypeKind::ROW && type->size() == 0) { + return false; + } + if (type->kind() == TypeKind::UNKNOWN) { + return false; + } + if (type->kind() == TypeKind::HUGEINT) { + return false; + } + // Disable testing with TableScan when input contains TIMESTAMP type, due to + // the issue #8127. + if (type->kind() == TypeKind::TIMESTAMP) { + return false; + } + + for (auto i = 0; i < type->size(); ++i) { + if (!isTableScanSupported(type->childAt(i))) { + return false; + } + } + + return true; +} + +void RowNumberFuzzer::addPlansWithTableScan( + const std::string& tableDir, + const std::vector& partitionKeys, + const std::vector& input, + std::vector& altPlans) { + VELOX_CHECK(!tableDir.empty()); + + if (!isTableScanSupported(input[0]->type())) { + return; + } + + std::vector> inputSplits; + for (auto i = 0; i < input.size(); ++i) { + const std::string filePath = fmt::format("{}/row_number/{}", tableDir, i); + writeToFile(filePath, input[i], writerPool_.get()); + inputSplits.push_back(makeSplit(filePath)); + } + + altPlans.push_back(makePlanWithTableScan( + asRowType(input[0]->type()), partitionKeys, inputSplits)); +} + +void RowNumberFuzzer::verify() { + const auto [keyNames, keyTypes] = generatePartitionKeys(); + const auto input = generateInput(keyNames, keyTypes); + // Flatten inputs. 
+ const auto flatInput = flatten(input); + + if (VLOG_IS_ON(1)) { + VLOG(1) << "Input: " << input[0]->toString(); + for (const auto& v : flatInput) { + VLOG(1) << std::endl << v->toString(0, v->size()); + } + } + + auto defaultPlan = makeDefaultPlan(keyNames, input); + const auto expected = execute(defaultPlan, /*injectSpill=*/false); + + if (expected != nullptr) { + if (const auto referenceResult = + computeReferenceResults(defaultPlan.plan, input)) { + VELOX_CHECK( + assertEqualResults( + referenceResult.value(), + defaultPlan.plan->outputType(), + {expected}), + "Velox and Reference results don't match"); + } + } + + std::vector altPlans; + altPlans.push_back(std::move(defaultPlan)); + + const auto tableScanDir = exec::test::TempDirectoryPath::create(); + addPlansWithTableScan(tableScanDir->getPath(), keyNames, input, altPlans); + + for (auto i = 0; i < altPlans.size(); ++i) { + LOG(INFO) << "Testing plan #" << i; + auto actual = execute(altPlans[i], /*injectSpill=*/false); + if (actual != nullptr && expected != nullptr) { + VELOX_CHECK( + assertEqualResults({expected}, {actual}), + "Logically equivalent plans produced different results"); + } else { + VELOX_CHECK( + FLAGS_enable_oom_injection, "Got unexpected nullptr for results"); + } + + if (FLAGS_enable_spill) { + LOG(INFO) << "Testing plan #" << i << " with spilling"; + actual = execute(altPlans[i], /*injectSpill=*/true); + if (actual != nullptr && expected != nullptr) { + try { + VELOX_CHECK( + assertEqualResults({expected}, {actual}), + "Logically equivalent plans produced different results"); + } catch (const VeloxException& e) { + LOG(ERROR) << "Expected\n" + << expected->toString(0, expected->size()) << "\nActual\n" + << actual->toString(0, actual->size()); + throw; + } + } else { + VELOX_CHECK( + FLAGS_enable_oom_injection, "Got unexpected nullptr for results"); + } + } + } +} + +void RowNumberFuzzer::go() { + VELOX_USER_CHECK( + FLAGS_steps > 0 || FLAGS_duration_sec > 0, + "Either --steps or --duration_sec needs to be greater than zero.") + VELOX_USER_CHECK_GE(FLAGS_batch_size, 10, "Batch size must be at least 10."); + + const auto startTime = std::chrono::system_clock::now(); + size_t iteration = 0; + + while (!isDone(iteration, startTime)) { + LOG(INFO) << "==============================> Started iteration " + << iteration << " (seed: " << currentSeed_ << ")"; + verify(); + LOG(INFO) << "==============================> Done with iteration " + << iteration; + + reSeed(); + ++iteration; + } +} +} // namespace + +void rowNumberFuzzer( + size_t seed, + std::unique_ptr referenceQueryRunner) { + RowNumberFuzzer(seed, std::move(referenceQueryRunner)).go(); +} +} // namespace facebook::velox::exec::test diff --git a/velox/exec/fuzzer/RowNumberFuzzer.h b/velox/exec/fuzzer/RowNumberFuzzer.h new file mode 100644 index 0000000000000..30cd960e327f4 --- /dev/null +++ b/velox/exec/fuzzer/RowNumberFuzzer.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/velox/exec/fuzzer/RowNumberFuzzer.h b/velox/exec/fuzzer/RowNumberFuzzer.h
new file mode 100644
index 0000000000000..30cd960e327f4
--- /dev/null
+++ b/velox/exec/fuzzer/RowNumberFuzzer.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <memory>
+#include "velox/exec/fuzzer/ReferenceQueryRunner.h"
+
+namespace facebook::velox::exec::test {
+void rowNumberFuzzer(
+    size_t seed,
+    std::unique_ptr<ReferenceQueryRunner> referenceQueryRunner);
+}
diff --git a/velox/exec/fuzzer/RowNumberFuzzerRunner.h b/velox/exec/fuzzer/RowNumberFuzzerRunner.h
new file mode 100644
index 0000000000000..2d018f81d3068
--- /dev/null
+++ b/velox/exec/fuzzer/RowNumberFuzzerRunner.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include "velox/common/file/FileSystems.h"
+
+#include "velox/exec/fuzzer/ReferenceQueryRunner.h"
+#include "velox/exec/fuzzer/RowNumberFuzzer.h"
+#include "velox/serializers/PrestoSerializer.h"
+
+/// RowNumber FuzzerRunner leverages RowNumberFuzzer and VectorFuzzer to
+/// automatically generate and execute tests. It works as follows:
+///
+///  1. Plan Generation: generate two equivalent query plans, one with
+///     row-number over a ValuesNode and the other over a TableScanNode.
+///  2. Execution: run the logically equivalent query plans and check that
+///     their results are the same.
+///  3. Rinse and repeat.
+///
+/// It is used as follows:
+///
+///  $ ./velox_row_number_fuzzer_test --duration_sec 600
+///
+/// The flags that configure RowNumberFuzzer's behavior are:
+///
+///  --steps: how many iterations to run.
+///  --duration_sec: alternatively, for how many seconds it should run (takes
+///      precedence over --steps).
+///  --seed: pass a deterministic seed to reproduce the behavior (each
+///      iteration will print a seed as part of the logs).
+///  --v=1: verbose logging; print a lot more details about the execution.
+///  --batch_size: size of input vector batches generated.
+///  --num_batches: number of input vector batches to generate.
+///  --enable_spill: test plans with spilling enabled.
+///  --enable_oom_injection: randomly trigger OOM while executing query plans.
+/// e.g: +/// +/// $ ./velox_row_number_fuzzer_test \ +/// --seed 123 \ +/// --duration_sec 600 \ +/// --v=1 + +namespace facebook::velox::exec::test { + +class RowNumberFuzzerRunner { + public: + static int run( + size_t seed, + std::unique_ptr referenceQueryRunner) { + serializer::presto::PrestoVectorSerde::registerVectorSerde(); + filesystems::registerLocalFileSystem(); + rowNumberFuzzer(seed, std::move(referenceQueryRunner)); + return RUN_ALL_TESTS(); + } +}; + +} // namespace facebook::velox::exec::test diff --git a/velox/exec/fuzzer/WindowFuzzer.cpp b/velox/exec/fuzzer/WindowFuzzer.cpp index 73f3c4e2494e7..5ce2a7730f751 100644 --- a/velox/exec/fuzzer/WindowFuzzer.cpp +++ b/velox/exec/fuzzer/WindowFuzzer.cpp @@ -166,8 +166,7 @@ std::string WindowFuzzer::generateOrderByClause( frame << ", "; } frame << sortingKeysAndOrders[i].key_ << " " - << sortingKeysAndOrders[i].order_ << " " - << sortingKeysAndOrders[i].nullsOrder_; + << sortingKeysAndOrders[i].sortOrder_.toString(); } return frame.str(); } @@ -193,11 +192,10 @@ WindowFuzzer::generateSortingKeysAndOrders( std::vector& types) { auto keys = generateSortingKeys(prefix, names, types); std::vector results; - // TODO: allow randomly generating orders. for (auto i = 0; i < keys.size(); ++i) { - std::string order = "asc"; - std::string nullsOrder = "nulls last"; - results.push_back(SortingKeyAndOrder(keys[i], order, nullsOrder)); + auto asc = vectorFuzzer_.coinToss(0.5); + auto nullsFirst = vectorFuzzer_.coinToss(0.5); + results.emplace_back(keys[i], core::SortOrder(asc, nullsFirst)); } return results; } @@ -250,8 +248,7 @@ void WindowFuzzer::go() { // If the function is order-dependent or uses "rows" frame, sort all input // rows by row_number additionally. if (requireSortedInput || isRowsFrame) { - sortingKeysAndOrders.push_back( - SortingKeyAndOrder("row_number", "asc", "nulls last")); + sortingKeysAndOrders.emplace_back("row_number", core::kAscNullsLast); ++stats_.numSortedInputs; } @@ -301,20 +298,16 @@ void WindowFuzzer::testAlternativePlans( const std::vector& input, bool customVerification, const std::shared_ptr& customVerifier, - const velox::test::ResultOrError& expected) { + const velox::fuzzer::ResultOrError& expected) { std::vector plans; std::vector allKeys; for (const auto& key : partitionKeys) { - allKeys.push_back(key + " NULLS FIRST"); + allKeys.emplace_back(key + " NULLS FIRST"); } for (const auto& keyAndOrder : sortingKeysAndOrders) { - allKeys.push_back(folly::to( - keyAndOrder.key_, - " ", - keyAndOrder.order_, - " ", - keyAndOrder.nullsOrder_)); + allKeys.emplace_back(fmt::format( + "{} {}", keyAndOrder.key_, keyAndOrder.sortOrder_.toString())); } // Streaming window from values. 
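The WindowFuzzer change above routes all ordering text through `core::SortOrder::toString()` instead of hand-assembled `order_`/`nullsOrder_` strings. As a quick illustration of the mapping (the constructor arguments and the exact `toString()` casing are assumptions based on how this diff uses `core::SortOrder`):

```cpp
#include <fmt/format.h>
#include <string>

// Builds the ORDER BY fragment the fuzzer emits for one sorting key.
std::string orderByText(const std::string& key, const core::SortOrder& order) {
  return fmt::format("{} {}", key, order.toString());
}

// orderByText("row_number", core::kAscNullsLast)
//   -> "row_number ASC NULLS LAST"
// where kAscNullsLast corresponds to SortOrder(/*ascending=*/true,
// /*nullsFirst=*/false), the same combination generateSortingKeysAndOrders
// now draws at random via coinToss().
```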
@@ -400,7 +393,7 @@ bool WindowFuzzer::verifyWindow( persistReproInfo({{plan, {}}}, reproPersistPath_); } - velox::test::ResultOrError resultOrError; + velox::fuzzer::ResultOrError resultOrError; try { resultOrError = execute(plan); if (resultOrError.exceptionPtr) { diff --git a/velox/exec/fuzzer/WindowFuzzer.h b/velox/exec/fuzzer/WindowFuzzer.h index a70b2c67deb85..bf36de8a9e3de 100644 --- a/velox/exec/fuzzer/WindowFuzzer.h +++ b/velox/exec/fuzzer/WindowFuzzer.h @@ -77,20 +77,11 @@ class WindowFuzzer : public AggregationFuzzerBase { private: struct SortingKeyAndOrder { - std::string key_; - std::string order_; - std::string nullsOrder_; - - SortingKeyAndOrder() = delete; - - SortingKeyAndOrder( - const std::string& key, - const std::string& order, - const std::string& nullsOrder) { - key_ = key; - order_ = order; - nullsOrder_ = nullsOrder; - } + const std::string key_; + const core::SortOrder sortOrder_; + + SortingKeyAndOrder(std::string key, core::SortOrder sortOrder) + : key_(std::move(key)), sortOrder_(std::move(sortOrder)) {} }; void addWindowFunctionSignatures(const WindowFunctionMap& signatureMap); @@ -131,7 +122,7 @@ class WindowFuzzer : public AggregationFuzzerBase { const std::vector& input, bool customVerification, const std::shared_ptr& customVerifier, - const velox::test::ResultOrError& expected); + const velox::fuzzer::ResultOrError& expected); const std::unordered_set orderDependentFunctions_; diff --git a/velox/exec/fuzzer/WindowFuzzerRunner.h b/velox/exec/fuzzer/WindowFuzzerRunner.h index 16c512b9a51ca..147ea5471a222 100644 --- a/velox/exec/fuzzer/WindowFuzzerRunner.h +++ b/velox/exec/fuzzer/WindowFuzzerRunner.h @@ -26,7 +26,7 @@ #include "velox/exec/Aggregate.h" #include "velox/exec/fuzzer/AggregationFuzzerOptions.h" #include "velox/exec/fuzzer/WindowFuzzer.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/parse/TypeResolver.h" #include "velox/serializers/PrestoSerializer.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -63,9 +63,9 @@ class WindowFuzzerRunner { exit(1); } - auto filteredAggregationSignatures = velox::test::filterSignatures( + auto filteredAggregationSignatures = velox::fuzzer::filterSignatures( aggregationSignatures, options.onlyFunctions, options.skipFunctions); - auto filteredWindowSignatures = velox::test::filterSignatures( + auto filteredWindowSignatures = velox::fuzzer::filterSignatures( windowSignatures, options.onlyFunctions, options.skipFunctions); if (filteredAggregationSignatures.empty() && filteredWindowSignatures.empty()) { diff --git a/velox/exec/tests/CMakeLists.txt b/velox/exec/tests/CMakeLists.txt index ddfb25743d23a..6b1f0f86b7da8 100644 --- a/velox/exec/tests/CMakeLists.txt +++ b/velox/exec/tests/CMakeLists.txt @@ -207,6 +207,12 @@ add_library(velox_join_fuzzer JoinFuzzer.cpp) target_link_libraries(velox_join_fuzzer velox_type velox_vector_fuzzer velox_exec_test_lib velox_expression_test_utility) +# RowNumber Fuzzer. 
+add_executable(velox_row_number_fuzzer_test RowNumberFuzzerTest.cpp)
+
+target_link_libraries(velox_row_number_fuzzer_test velox_row_number_fuzzer
+                      gtest gtest_main)
+
 add_executable(velox_join_fuzzer_test JoinFuzzerTest.cpp)
 
 target_link_libraries(velox_join_fuzzer_test velox_join_fuzzer gtest
                       gtest_main)
diff --git a/velox/exec/tests/DriverTest.cpp b/velox/exec/tests/DriverTest.cpp
index 5601cbe2e3119..990f8a3c75b19 100644
--- a/velox/exec/tests/DriverTest.cpp
+++ b/velox/exec/tests/DriverTest.cpp
@@ -26,6 +26,7 @@
 #include "velox/exec/tests/utils/Cursor.h"
 #include "velox/exec/tests/utils/OperatorTestBase.h"
 #include "velox/exec/tests/utils/PlanBuilder.h"
+#include "velox/functions/Udf.h"
 
 using namespace facebook::velox;
 using namespace facebook::velox::exec;
@@ -1519,6 +1520,44 @@ DEBUG_ONLY_TEST_F(DriverTest, driverCpuTimeSlicingCheck) {
   }
 }
 
+namespace {
+
+template <typename T>
+struct ThrowRuntimeExceptionFunction {
+  template <typename TResult, typename TInput>
+  void call(TResult& out, const TInput& in) {
+    VELOX_CHECK(false, "Throwing exception");
+  }
+};
+} // namespace
+
+TEST_F(DriverTest, additionalContextInRuntimeException) {
+  // Ensures that exceptions thrown during the execution of an operator carry
+  // the expected context. This is done by executing a plan with a
+  // FilterProject whose expressions set up hierarchical contexts. Finally, we
+  // verify that all the essential context is present.
+  auto vector = makeRowVector({makeFlatVector<int64_t>({1, 2, 3, 4, 5, 6})});
+  registerFunction<ThrowRuntimeExceptionFunction, int64_t, int64_t>(
+      {"throwException"});
+  auto op = PlanBuilder()
+                .values({vector})
+                .project({"c0 + throwException(c0)"})
+                .planNode();
+  try {
+    assertQuery(op, vector);
+  } catch (VeloxException& e) {
+    ASSERT_EQ(e.context(), "throwexception(c0)");
+    auto additionalContext = e.additionalContext();
+    // Remove the string following `TaskId` from the additional context since
+    // it is nondeterministic.
+    additionalContext.resize(additionalContext.find(" TaskId:"));
+    ASSERT_EQ(
+        additionalContext,
+        "Top-level Expression: plus(c0, throwexception(c0)) Operator: "
+        "FilterProject(1) PlanNodeId: 1");
+  }
+}
+
 class OpCallStatusTest : public OperatorTestBase {};
 
 // Test that the opCallStatus is returned properly and formats the call as
diff --git a/velox/exec/tests/ExchangeClientTest.cpp b/velox/exec/tests/ExchangeClientTest.cpp
index 68021897b8ae6..4deae1b71fec6 100644
--- a/velox/exec/tests/ExchangeClientTest.cpp
+++ b/velox/exec/tests/ExchangeClientTest.cpp
@@ -423,5 +423,52 @@ TEST_F(ExchangeClientTest, sourceTimeout) {
   test::testingShutdownLocalExchangeSource();
 }
 
+TEST_F(ExchangeClientTest, callNextAfterClose) {
+  constexpr int32_t kNumSources = 3;
+  common::testutil::TestValue::enable();
+  auto client = std::make_shared<ExchangeClient>(
+      "test", 17, 1 << 20, pool(), executor());
+
+  bool atEnd;
+  ContinueFuture future;
+  auto pages = client->next(1, &atEnd, &future);
+  ASSERT_EQ(0, pages.size());
+  ASSERT_FALSE(atEnd);
+
+  for (auto i = 0; i < kNumSources; ++i) {
+    client->addRemoteTaskId(fmt::format("local://{}", i));
+  }
+  client->noMoreRemoteTasks();
+
+  // Fetch a page. No page is found. All sources are fetching.
+  pages = client->next(1, &atEnd, &future);
+  EXPECT_TRUE(pages.empty());
+
+  const auto& queue = client->queue();
+  for (auto i = 0; i < 10; ++i) {
+    enqueue(*queue, makePage(1'000 + i));
+  }
+
+  // Fetch multiple pages. Each page is slightly larger than 1K bytes, hence
+  // only 4 pages fit.
+  pages = client->next(5'000, &atEnd, &future);
+  EXPECT_EQ(4, pages.size());
+  EXPECT_FALSE(atEnd);
+
+  // Close the client and try calling next again.
+ client->close(); + + // Here we should have no pages returned, be at end (we are closed) and the + // future should be invalid (not based on a valid promise). + ContinueFuture futureFinal{ContinueFuture::makeEmpty()}; + pages = client->next(10'000, &atEnd, &futureFinal); + EXPECT_EQ(0, pages.size()); + EXPECT_TRUE(atEnd); + EXPECT_FALSE(futureFinal.valid()); + + client->close(); + test::testingShutdownLocalExchangeSource(); +} + } // namespace } // namespace facebook::velox::exec diff --git a/velox/exec/tests/HashJoinTest.cpp b/velox/exec/tests/HashJoinTest.cpp index 601a0d90a5e51..7bf3b87496ef7 100644 --- a/velox/exec/tests/HashJoinTest.cpp +++ b/velox/exec/tests/HashJoinTest.cpp @@ -203,7 +203,7 @@ std::pair numTaskSpillFiles(const exec::Task& task) { void abortPool(memory::MemoryPool* pool) { try { VELOX_FAIL("Manual MemoryPool Abortion"); - } catch (const VeloxException& error) { + } catch (const VeloxException&) { pool->abort(std::current_exception()); } } @@ -4041,6 +4041,71 @@ TEST_F(HashJoinTest, lazyVectors) { } } +TEST_F(HashJoinTest, lazyVectorNotLoadedInFilter) { + // Ensure that if lazy vectors are temporarily wrapped during a filter's + // execution and remain unloaded, the temporary wrap is promptly + // discarded. This precaution prevents the generation of the probe's output + // from wrapping an unloaded vector while the temporary wrap is + // still alive. + // This is done by generating a sufficiently small batch to allow the lazy + // vector to remain unloaded, as it doesn't need to be split between batches. + // Then we use a filter that skips the execution of the expression containing + // the lazy vector, thereby avoiding its loading. + const vector_size_t vectorSize = 1'000; + auto probeVectors = makeBatches(1, [&](int32_t /*unused*/) { + return makeRowVector( + {makeFlatVector(vectorSize, folly::identity), + makeFlatVector(vectorSize, [](auto row) { return row % 23; }), + makeFlatVector( + vectorSize, [](auto row) { return row % 31; })}); + }); + + std::vector buildVectors = + makeBatches(1, [&](int32_t /*unused*/) { + return makeRowVector({makeFlatVector( + vectorSize, [](auto row) { return row * 3; })}); + }); + + std::shared_ptr probeFile = TempFilePath::create(); + writeToFile(probeFile->getPath(), probeVectors); + + std::shared_ptr buildFile = TempFilePath::create(); + writeToFile(buildFile->getPath(), buildVectors); + + createDuckDbTable("t", probeVectors); + createDuckDbTable("u", buildVectors); + + // Lazy vector is part of the filter but never gets loaded. 
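A note on why the filter in this test can leave the lazy column unloaded: the probe data makes the first disjunct always true, so conjunct evaluation never needs `c2`. Reduced to plain C++ (values per the generators above; names illustrative):

```cpp
#include <cstdint>

// c1 = row % 23 is always >= 0, so "c1 >= 0 OR c2 > 0" short-circuits and
// the c2 column behind the LazyVector is never touched.
bool filterPasses(int64_t c1 /*, c2 is never evaluated */) {
  return c1 >= 0;
}
```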
+ auto planNodeIdGenerator = std::make_shared(); + core::PlanNodeId probeScanId; + core::PlanNodeId buildScanId; + auto op = PlanBuilder(planNodeIdGenerator) + .tableScan(asRowType(probeVectors[0]->type())) + .capturePlanNodeId(probeScanId) + .hashJoin( + {"c0"}, + {"c0"}, + PlanBuilder(planNodeIdGenerator) + .tableScan(asRowType(buildVectors[0]->type())) + .capturePlanNodeId(buildScanId) + .planNode(), + "c1 >= 0 OR c2 > 0", + {"c1", "c2"}) + .planNode(); + SplitInput splitInput = { + {probeScanId, + {exec::Split(makeHiveConnectorSplit(probeFile->getPath()))}}, + {buildScanId, + {exec::Split(makeHiveConnectorSplit(buildFile->getPath()))}}, + }; + HashJoinBuilder(*pool_, duckDbQueryRunner_, driverExecutor_.get()) + .planNode(std::move(op)) + .inputSplits(splitInput) + .checkSpillStats(false) + .referenceQuery("SELECT t.c1, t.c2 FROM t, u WHERE t.c0 = u.c0") + .run(); +} + TEST_F(HashJoinTest, dynamicFilters) { const int32_t numSplits = 10; const int32_t numRowsProbe = 333; @@ -4111,6 +4176,7 @@ TEST_F(HashJoinTest, dynamicFilters) { { // Inner join. core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType) .capturePlanNodeId(probeScanId) @@ -4121,6 +4187,7 @@ TEST_F(HashJoinTest, dynamicFilters) { "", {"c0", "c1", "u_c1"}, core::JoinType::kInner) + .capturePlanNodeId(joinId) .project({"c0", "c1 + 1", "c1 + u_c1"}) .planNode(); { @@ -4131,16 +4198,21 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c0, t.c1 + 1, t.c1 + u.c1 FROM t, u WHERE t.c0 = u.c0") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4157,6 +4229,7 @@ TEST_F(HashJoinTest, dynamicFilters) { "", {"c0", "c1"}, core::JoinType::kLeftSemiFilter) + .capturePlanNodeId(joinId) .project({"c0", "c1 + 1"}) .planNode(); @@ -4168,17 +4241,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c0, t.c1 + 1 FROM t WHERE t.c0 IN (SELECT c0 FROM u)") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. 
ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_GT(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4195,6 +4273,7 @@ TEST_F(HashJoinTest, dynamicFilters) { "", {"u_c0", "u_c1"}, core::JoinType::kRightSemiFilter) + .capturePlanNodeId(joinId) .project({"u_c0", "u_c1 + 1"}) .planNode(); @@ -4206,17 +4285,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT u.c0, u.c1 + 1 FROM u WHERE u.c0 IN (SELECT c0 FROM t)") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4232,6 +4316,7 @@ TEST_F(HashJoinTest, dynamicFilters) { assignments["b"] = regularColumn("c1", BIGINT()); core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .startTableScan() .outputType(scanOutputType) @@ -4239,6 +4324,7 @@ TEST_F(HashJoinTest, dynamicFilters) { .endTableScan() .capturePlanNodeId(probeScanId) .hashJoin({"a"}, {"u_c0"}, buildSide, "", {"a", "b", "u_c1"}) + .capturePlanNodeId(joinId) .project({"a", "b + 1", "b + u_c1"}) .planNode(); @@ -4249,17 +4335,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c0, t.c1 + 1, t.c1 + u.c1 FROM t, u WHERE t.c0 = u.c0") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4268,10 +4359,12 @@ TEST_F(HashJoinTest, dynamicFilters) { // Push-down that requires merging filters. 
{ core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType, {"c0 < 500::INTEGER"}) .capturePlanNodeId(probeScanId) .hashJoin({"c0"}, {"u_c0"}, buildSide, "", {"c1", "u_c1"}) + .capturePlanNodeId(joinId) .project({"c1 + u_c1"}) .planNode(); @@ -4282,17 +4375,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c1 + u.c1 FROM t, u WHERE t.c0 = u.c0 AND t.c0 < 500") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4301,11 +4399,13 @@ TEST_F(HashJoinTest, dynamicFilters) { // Push-down that turns join into a no-op. { core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType) .capturePlanNodeId(probeScanId) .hashJoin({"c0"}, {"u_c0"}, keyOnlyBuildSide, "", {"c0", "c1"}) + .capturePlanNodeId(joinId) .project({"c0", "c1 + 1"}) .planNode(); @@ -4315,12 +4415,14 @@ TEST_F(HashJoinTest, dynamicFilters) { .referenceQuery("SELECT t.c0, t.c1 + 1 FROM t, u WHERE t.c0 = u.c0") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); @@ -4328,6 +4430,9 @@ TEST_F(HashJoinTest, dynamicFilters) { getReplacedWithFilterRows(task, 1).sum, numRowsBuild * numSplits); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4337,10 +4442,12 @@ TEST_F(HashJoinTest, dynamicFilters) { // number of columns than the input. 
{ core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType) .capturePlanNodeId(probeScanId) .hashJoin({"c0"}, {"u_c0"}, keyOnlyBuildSide, "", {"c0"}) + .capturePlanNodeId(joinId) .planNode(); HashJoinBuilder(*pool_, duckDbQueryRunner_, driverExecutor_.get()) @@ -4349,12 +4456,14 @@ TEST_F(HashJoinTest, dynamicFilters) { .referenceQuery("SELECT t.c0 FROM t JOIN u ON (t.c0 = u.c0)") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); @@ -4362,6 +4471,9 @@ TEST_F(HashJoinTest, dynamicFilters) { getReplacedWithFilterRows(task, 1).sum, numRowsBuild * numSplits); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4370,10 +4482,12 @@ TEST_F(HashJoinTest, dynamicFilters) { // Push-down that requires merging filters and turns join into a no-op. { core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType, {"c0 < 500::INTEGER"}) .capturePlanNodeId(probeScanId) .hashJoin({"c0"}, {"u_c0"}, keyOnlyBuildSide, "", {"c1"}) + .capturePlanNodeId(joinId) .project({"c1 + 1"}) .planNode(); @@ -4384,17 +4498,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c1 + 1 FROM t, u WHERE t.c0 = u.c0 AND t.c0 < 500") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_GT(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4404,12 +4523,14 @@ TEST_F(HashJoinTest, dynamicFilters) { { // Inner join. 
core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType, {"c0 < 200::INTEGER"}) .capturePlanNodeId(probeScanId) .hashJoin( {"c0"}, {"u_c0"}, buildSide, "", {"c1"}, core::JoinType::kInner) + .capturePlanNodeId(joinId) .project({"c1 + 1"}) .planNode(); @@ -4421,17 +4542,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c1 + 1 FROM t, u WHERE t.c0 = u.c0 AND t.c0 < 200") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_GT(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4448,6 +4574,7 @@ TEST_F(HashJoinTest, dynamicFilters) { "", {"c1"}, core::JoinType::kLeftSemiFilter) + .capturePlanNodeId(joinId) .project({"c1 + 1"}) .planNode(); @@ -4459,17 +4586,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c1 + 1 FROM t WHERE t.c0 IN (SELECT c0 FROM u) AND t.c0 < 200") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_GT(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4486,6 +4618,7 @@ TEST_F(HashJoinTest, dynamicFilters) { "", {"u_c1"}, core::JoinType::kRightSemiFilter) + .capturePlanNodeId(joinId) .project({"u_c1 + 1"}) .planNode(); @@ -4497,17 +4630,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT u.c1 + 1 FROM u WHERE u.c0 IN (SELECT c0 FROM t) AND u.c0 < 200") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. 
ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4516,9 +4654,11 @@ TEST_F(HashJoinTest, dynamicFilters) { // Disable filter push-down by using values in place of scan. { + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .values(probeVectors) .hashJoin({"c0"}, {"u_c0"}, buildSide, "", {"c1"}) + .capturePlanNodeId(joinId) .project({"c1 + 1"}) .planNode(); @@ -4526,6 +4666,7 @@ TEST_F(HashJoinTest, dynamicFilters) { .planNode(std::move(op)) .referenceQuery("SELECT t.c1 + 1 FROM t, u WHERE t.c0 = u.c0") .verifier([&](const std::shared_ptr& task, bool hasSpill) { + auto planStats = toPlanStats(task->taskStats()); ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(numRowsProbe * numSplits, getInputPositions(task, 1)); @@ -4537,11 +4678,13 @@ TEST_F(HashJoinTest, dynamicFilters) { // probe side. { core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType) .capturePlanNodeId(probeScanId) .project({"cast(c0 + 1 as integer) AS t_key", "c1"}) .hashJoin({"t_key"}, {"u_c0"}, buildSide, "", {"c1"}) + .capturePlanNodeId(joinId) .project({"c1 + 1"}) .planNode(); @@ -4550,14 +4693,113 @@ TEST_F(HashJoinTest, dynamicFilters) { .makeInputSplits(makeInputSplits(probeScanId)) .referenceQuery("SELECT t.c1 + 1 FROM t, u WHERE (t.c0 + 1) = u.c0") .verifier([&](const std::shared_ptr& task, bool hasSpill) { + auto planStats = toPlanStats(task->taskStats()); ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(numRowsProbe * numSplits, getInputPositions(task, 1)); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); }) .run(); } } +TEST_F(HashJoinTest, dynamicFiltersStatsWithChainedJoins) { + const int32_t numSplits = 10; + const int32_t numProbeRows = 333; + const int32_t numBuildRows = 100; + + std::vector probeVectors; + probeVectors.reserve(numSplits); + std::vector> tempFiles; + for (int32_t i = 0; i < numSplits; ++i) { + auto rowVector = makeRowVector({ + makeFlatVector( + numProbeRows, [&](auto row) { return row - i * 10; }), + makeFlatVector(numProbeRows, [](auto row) { return row; }), + }); + probeVectors.push_back(rowVector); + tempFiles.push_back(TempFilePath::create()); + writeToFile(tempFiles.back()->getPath(), rowVector); + } + auto makeInputSplits = [&](const core::PlanNodeId& nodeId) { + return [&] { + std::vector probeSplits; + for (auto& file : tempFiles) { + probeSplits.push_back( + exec::Split(makeHiveConnectorSplit(file->getPath()))); + } + SplitInput splits; + splits.emplace(nodeId, probeSplits); + return splits; + }; + }; + + // 100 key values in [35, 233] range. 
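These verifier lambdas and the OperatorUtilsTest additions later in this diff pin down a small surface for the new stats holder. A minimal sketch consistent with that usage (`empty()`, `add()`, `clear()`, and a set of producer node ids); the real Velox definition may differ in details such as the id type. The chained-joins test setup that the preceding comment describes continues right after this sketch.

```cpp
#include <string>
#include <unordered_set>

// Minimal sketch of DynamicFilterStats, inferred from how the tests in this
// diff use it; not the authoritative definition.
struct DynamicFilterStats {
  // Plan node ids of the joins that pushed a dynamic filter into this scan.
  std::unordered_set<std::string> producerNodeIds;

  bool empty() const { return producerNodeIds.empty(); }

  // Merging keeps the union of producers, so repeated adds are idempotent.
  void add(const DynamicFilterStats& other) {
    producerNodeIds.insert(
        other.producerNodeIds.begin(), other.producerNodeIds.end());
  }

  void clear() { producerNodeIds.clear(); }
};
```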
+ std::vector buildVectors; + for (int i = 0; i < 5; ++i) { + buildVectors.push_back(makeRowVector({ + makeFlatVector( + numBuildRows / 5, + [i](auto row) { return 35 + 2 * (row + i * numBuildRows / 5); }), + makeFlatVector(numBuildRows / 5, [](auto row) { return row; }), + })); + } + + createDuckDbTable("t", probeVectors); + createDuckDbTable("u", buildVectors); + + auto probeType = ROW({"c0", "c1"}, {INTEGER(), BIGINT()}); + + auto planNodeIdGenerator = std::make_shared(); + + auto buildSide1 = PlanBuilder(planNodeIdGenerator, pool_.get()) + .values(buildVectors) + .project({"c0 AS u_c0", "c1 AS u_c1"}) + .planNode(); + auto buildSide2 = PlanBuilder(planNodeIdGenerator, pool_.get()) + .values(buildVectors) + .project({"c0 AS u_c0", "c1 AS u_c1"}) + .planNode(); + // Inner join pushdown. + core::PlanNodeId probeScanId; + core::PlanNodeId joinId1; + core::PlanNodeId joinId2; + auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) + .tableScan(probeType) + .capturePlanNodeId(probeScanId) + .hashJoin( + {"c0"}, + {"u_c0"}, + buildSide1, + "", + {"c0", "c1"}, + core::JoinType::kInner) + .capturePlanNodeId(joinId1) + .hashJoin( + {"c0"}, + {"u_c0"}, + buildSide2, + "", + {"c0", "c1", "u_c1"}, + core::JoinType::kInner) + .capturePlanNodeId(joinId2) + .project({"c0", "c1 + 1", "c1 + u_c1"}) + .planNode(); + HashJoinBuilder(*pool_, duckDbQueryRunner_, driverExecutor_.get()) + .planNode(std::move(op)) + .makeInputSplits(makeInputSplits(probeScanId)) + .injectSpill(false) + .referenceQuery( + "SELECT t.c0, t.c1 + 1, t.c1 + u.c1 FROM t, u WHERE t.c0 = u.c0") + .verifier([&](const std::shared_ptr& task, bool /*unused*/) { + auto planStats = toPlanStats(task->taskStats()); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId1, joinId2})); + }) + .run(); +} + TEST_F(HashJoinTest, dynamicFiltersWithSkippedSplits) { const int32_t numSplits = 20; const int32_t numNonSkippedSplits = 10; diff --git a/velox/exec/tests/JoinFuzzer.cpp b/velox/exec/tests/JoinFuzzer.cpp index 5db955780c3ac..f1c534339824b 100644 --- a/velox/exec/tests/JoinFuzzer.cpp +++ b/velox/exec/tests/JoinFuzzer.cpp @@ -1020,7 +1020,7 @@ void JoinFuzzer::verify(core::JoinType joinType) { VELOX_CHECK( assertEqualResults({expected}, {actual}), "Logically equivalent plans produced different results"); - } catch (const VeloxException& e) { + } catch (const VeloxException&) { LOG(ERROR) << "Expected\n" << expected->toString(0, expected->size()) << "\nActual\n" << actual->toString(0, actual->size()); diff --git a/velox/exec/tests/OperatorUtilsTest.cpp b/velox/exec/tests/OperatorUtilsTest.cpp index cc192928e1e46..543b619dd90ea 100644 --- a/velox/exec/tests/OperatorUtilsTest.cpp +++ b/velox/exec/tests/OperatorUtilsTest.cpp @@ -464,3 +464,30 @@ TEST_F(OperatorUtilsTest, memStatsFromPool) { ASSERT_EQ(stats.peakSystemMemoryReservation, 0); ASSERT_EQ(stats.numMemoryAllocations, 1); } + +TEST_F(OperatorUtilsTest, dynamicFilterStats) { + DynamicFilterStats dynamicFilterStats; + ASSERT_TRUE(dynamicFilterStats.empty()); + const std::string nodeId1{"node1"}; + const std::string nodeId2{"node2"}; + dynamicFilterStats.producerNodeIds.emplace(nodeId1); + ASSERT_FALSE(dynamicFilterStats.empty()); + DynamicFilterStats dynamicFilterStatsToMerge; + dynamicFilterStatsToMerge.producerNodeIds.emplace(nodeId1); + ASSERT_FALSE(dynamicFilterStatsToMerge.empty()); + dynamicFilterStats.add(dynamicFilterStatsToMerge); + ASSERT_EQ(dynamicFilterStats.producerNodeIds.size(), 1); + ASSERT_EQ( + 
dynamicFilterStats.producerNodeIds, + std::unordered_set({nodeId1})); + + dynamicFilterStatsToMerge.producerNodeIds.emplace(nodeId2); + dynamicFilterStats.add(dynamicFilterStatsToMerge); + ASSERT_EQ(dynamicFilterStats.producerNodeIds.size(), 2); + ASSERT_EQ( + dynamicFilterStats.producerNodeIds, + std::unordered_set({nodeId1, nodeId2})); + + dynamicFilterStats.clear(); + ASSERT_TRUE(dynamicFilterStats.empty()); +} diff --git a/velox/exec/tests/PlanBuilderTest.cpp b/velox/exec/tests/PlanBuilderTest.cpp index 8d9ec1fa48935..31abd5fd3a611 100644 --- a/velox/exec/tests/PlanBuilderTest.cpp +++ b/velox/exec/tests/PlanBuilderTest.cpp @@ -239,4 +239,11 @@ TEST_F(PlanBuilderTest, windowFrame) { .planNode(), "Window frame of type RANGE PRECEDING or FOLLOWING requires single sorting key in ORDER BY"); } + +TEST_F(PlanBuilderTest, missingOutputType) { + VELOX_ASSERT_THROW( + PlanBuilder().startTableScan().endTableScan(), + "outputType must be specified"); +} + } // namespace facebook::velox::exec::test diff --git a/velox/exec/tests/PrintPlanWithStatsTest.cpp b/velox/exec/tests/PrintPlanWithStatsTest.cpp index 93f7c6aa2ecc9..8c8fef0b7b460 100644 --- a/velox/exec/tests/PrintPlanWithStatsTest.cpp +++ b/velox/exec/tests/PrintPlanWithStatsTest.cpp @@ -138,7 +138,7 @@ TEST_F(PrintPlanWithStatsTest, innerJoinWithTableScan) { {" HashBuild: Input: 100 rows \\(.+\\), Output: 0 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+ Memory allocations: .+, Threads: 1"}, {" HashProbe: Input: 2000 rows \\(.+\\), Output: 2000 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+, Memory allocations: .+, Threads: 1"}, {" -- TableScan\\[table: hive_table\\] -> c0:INTEGER, c1:BIGINT"}, - {" Input: 2000 rows \\(.+\\), Raw Input: 20480 rows \\(.+\\), Output: 2000 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+, Memory allocations: .+, Threads: 1, Splits: 20"}, + {" Input: 2000 rows \\(.+\\), Raw Input: 20480 rows \\(.+\\), Output: 2000 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+, Memory allocations: .+, Threads: 1, Splits: 20, DynamicFilter producer plan nodes: 3"}, {" -- Project\\[expressions: \\(u_c0:INTEGER, ROW\\[\"c0\"\\]\\), \\(u_c1:BIGINT, ROW\\[\"c1\"\\]\\)\\] -> u_c0:INTEGER, u_c1:BIGINT"}, {" Output: 100 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: 0B, Memory allocations: .+, Threads: 1"}, {" -- Values\\[100 rows in 1 vectors\\] -> c0:INTEGER, c1:BIGINT"}, @@ -184,7 +184,7 @@ TEST_F(PrintPlanWithStatsTest, innerJoinWithTableScan) { {" runningFinishWallNanos\\s+sum: .+, count: 1, min: .+, max: .+"}, {" runningGetOutputWallNanos\\s+sum: .+, count: 1, min: .+, max: .+"}, {" -- TableScan\\[table: hive_table\\] -> c0:INTEGER, c1:BIGINT"}, - {" Input: 2000 rows \\(.+\\), Raw Input: 20480 rows \\(.+\\), Output: 2000 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+, Memory allocations: .+, Threads: 1, Splits: 20"}, + {" Input: 2000 rows \\(.+\\), Raw Input: 20480 rows \\(.+\\), Output: 2000 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+, Memory allocations: .+, Threads: 1, Splits: 20, DynamicFilter producer plan nodes: 3"}, {" dataSourceAddSplitWallNanos[ ]* sum: .+, count: 1, min: .+, max: .+"}, {" dataSourceReadWallNanos[ ]* sum: .+, count: 1, min: .+, max: .+"}, {" dynamicFiltersAccepted[ ]* sum: 1, count: 1, min: 1, max: 1"}, diff --git a/velox/exec/tests/RowNumberFuzzerTest.cpp b/velox/exec/tests/RowNumberFuzzerTest.cpp new file mode 100644 index 0000000000000..3abdc9fd3e767 --- 
/dev/null
+++ b/velox/exec/tests/RowNumberFuzzerTest.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/init/Init.h>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "velox/common/memory/SharedArbitrator.h"
+#include "velox/connectors/hive/HiveConnector.h"
+#include "velox/exec/MemoryReclaimer.h"
+#include "velox/exec/fuzzer/DuckQueryRunner.h"
+#include "velox/exec/fuzzer/PrestoQueryRunner.h"
+#include "velox/exec/fuzzer/ReferenceQueryRunner.h"
+#include "velox/exec/fuzzer/RowNumberFuzzerRunner.h"
+
+DEFINE_int64(
+    seed,
+    0,
+    "Initial seed for random number generator used to reproduce previous "
+    "results (0 means start with random seed).");
+
+DEFINE_string(
+    presto_url,
+    "",
+    "Presto coordinator URI along with port. If set, we use Presto "
+    "source of truth. Otherwise, use DuckDB. Example: "
+    "--presto_url=http://127.0.0.1:8080");
+
+DEFINE_uint32(
+    req_timeout_ms,
+    1000,
+    "Timeout in milliseconds for HTTP requests made to reference DB, "
+    "such as Presto. Example: --req_timeout_ms=2000");
+
+using namespace facebook::velox::exec;
+
+namespace {
+std::unique_ptr<test::ReferenceQueryRunner> setupReferenceQueryRunner(
+    const std::string& prestoUrl,
+    const std::string& runnerName,
+    const uint32_t& reqTimeoutMs) {
+  if (prestoUrl.empty()) {
+    auto duckQueryRunner = std::make_unique<test::DuckQueryRunner>();
+    LOG(INFO) << "Using DuckDB as the reference DB.";
+    return duckQueryRunner;
+  }
+
+  LOG(INFO) << "Using Presto as the reference DB.";
+  return std::make_unique<test::PrestoQueryRunner>(
+      prestoUrl,
+      runnerName,
+      static_cast<std::chrono::milliseconds>(reqTimeoutMs));
+}
+
+// Invoked to set up the memory system with arbitration.
+void setupMemory() {
+  FLAGS_velox_enable_memory_usage_track_in_default_memory_pool = true;
+  FLAGS_velox_memory_leak_check_enabled = true;
+  facebook::velox::memory::SharedArbitrator::registerFactory();
+  facebook::velox::memory::MemoryManagerOptions options;
+  options.allocatorCapacity = 8L << 30;
+  options.arbitratorCapacity = 6L << 30;
+  options.arbitratorKind = "SHARED";
+  options.checkUsageLeak = true;
+  options.arbitrationStateCheckCb = memoryArbitrationStateCheck;
+  facebook::velox::memory::MemoryManager::initialize(options);
+}
+} // namespace
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  // Calls common init functions in the necessary order: initializing
+  // singletons, installing proper signal handlers for a better debugging
+  // experience, and initializing glog and gflags.
+  folly::Init init(&argc, &argv);
+  setupMemory();
+  auto referenceQueryRunner = setupReferenceQueryRunner(
+      FLAGS_presto_url, "row_number_fuzzer", FLAGS_req_timeout_ms);
+  const size_t initialSeed = FLAGS_seed == 0 ?
std::time(nullptr) : FLAGS_seed; + return test::RowNumberFuzzerRunner::run( + initialSeed, std::move(referenceQueryRunner)); +} diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index 400e55c767581..954d3c9804821 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -267,6 +267,8 @@ TEST_F(TableScanTest, allColumns) { ASSERT_TRUE(it != planStats.end()); ASSERT_TRUE(it->second.peakMemoryBytes > 0); ASSERT_LT(0, it->second.customStats.at("ioWaitNanos").sum); + // Verifies there is no dynamic filter stats. + ASSERT_TRUE(it->second.dynamicFilterStats.empty()); } TEST_F(TableScanTest, connectorStats) { diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp index 3edd7c0996f9a..73d7828c71942 100644 --- a/velox/exec/tests/utils/PlanBuilder.cpp +++ b/velox/exec/tests/utils/PlanBuilder.cpp @@ -135,6 +135,7 @@ PlanBuilder& PlanBuilder::tpchTableScan( } core::PlanNodePtr PlanBuilder::TableScanBuilder::build(core::PlanNodeId id) { + VELOX_CHECK_NOT_NULL(outputType_, "outputType must be specified"); std::unordered_map typedMapping; bool hasAssignments = !(assignments_.empty()); for (uint32_t i = 0; i < outputType_->size(); ++i) { diff --git a/velox/exec/tests/utils/PlanBuilder.h b/velox/exec/tests/utils/PlanBuilder.h index 5adf218555a0f..0c6928216a1f4 100644 --- a/velox/exec/tests/utils/PlanBuilder.h +++ b/velox/exec/tests/utils/PlanBuilder.h @@ -160,6 +160,10 @@ class PlanBuilder { /// Helper class to build a custom TableScanNode. /// Uses a planBuilder instance to get the next plan id, memory pool, and /// parse options. + /// + /// Uses the hive connector by default. Specify outputType, tableHandle, and + /// assignments for other connectors. If these three are specified, all other + /// builder arguments will be ignored. class TableScanBuilder { public: TableScanBuilder(PlanBuilder& builder) : planBuilder_(builder) {} @@ -177,6 +181,7 @@ class PlanBuilder { } /// @param outputType List of column names and types to read from the table. + /// This property is required. TableScanBuilder& outputType(RowTypePtr outputType) { outputType_ = std::move(outputType); return *this; diff --git a/velox/experimental/wave/common/Block.cuh b/velox/experimental/wave/common/Block.cuh index 1ab27fcc265b9..1b1281f276452 100644 --- a/velox/experimental/wave/common/Block.cuh +++ b/velox/experimental/wave/common/Block.cuh @@ -16,39 +16,98 @@ #pragma once +#include #include #include +#include +#include "velox/experimental/wave/common/CudaUtil.cuh" /// Utilities for booleans and indices and thread blocks. namespace facebook::velox::wave { +/// Converts an array of flags to an array of indices of set flags. The first +/// index is given by 'start'. The number of indices is returned in 'size', i.e. +/// this is 1 + the index of the last set flag. template < + typename T, int32_t blockSize, + cub::BlockScanAlgorithm Algorithm = cub::BLOCK_SCAN_RAKING> +inline int32_t __device__ __host__ boolToIndicesSharedSize() { + typedef cub::BlockScan BlockScanT; + + return sizeof(typename BlockScanT::TempStorage); +} + +/// Converts an array of flags to an array of indices of set flags. The first +/// index is given by 'start'. The number of indices is returned in 'size', i.e. +/// this is 1 + the index of the last set flag. 
+template < + int32_t blockSize, + typename T, cub::BlockScanAlgorithm Algorithm = cub::BLOCK_SCAN_RAKING, typename Getter> -__device__ inline void boolBlockToIndices( - Getter getter, - int32_t start, - int32_t* indices, - void* shmem, - int32_t& size) { - typedef cub::BlockScan BlockScanT; +__device__ inline void +boolBlockToIndices(Getter getter, T start, T* indices, void* shmem, T& size) { + typedef cub::BlockScan BlockScanT; auto* temp = reinterpret_cast(shmem); - int data[1]; + T data[1]; uint8_t flag = getter(); data[0] = flag; __syncthreads(); - int aggregate; + T aggregate; BlockScanT(*temp).ExclusiveSum(data, data, aggregate); - __syncthreads(); if (flag) { indices[data[0]] = threadIdx.x + start; } if (threadIdx.x == 0) { size = aggregate; } + __syncthreads(); +} + +inline int32_t __device__ __host__ bool256ToIndicesSize() { + return sizeof(typename cub::WarpScan::TempStorage) + + 33 * sizeof(uint16_t); +} + +/// Returns indices of set bits for 256 one byte flags. 'getter8' is +/// invoked for 8 flags at a time, with the ordinal of the 8 byte +/// flags word as argument, so that an index of 1 means flags +/// 8..15. The indices start at 'start' and last index + 1 is +/// returned in 'size'. +template +__device__ inline void +bool256ToIndices(Getter8 getter8, T start, T* indices, T& size, char* smem) { + using Scan = cub::WarpScan; + auto* smem16 = reinterpret_cast(smem); + int32_t group = threadIdx.x / 8; + uint64_t bits = getter8(group) & 0x0101010101010101; + if ((threadIdx.x & 7) == 0) { + smem16[group] = __popcll(bits); + if (threadIdx.x == blockDim.x - 8) { + smem16[32] = smem16[31]; + } + } + __syncthreads(); + if (threadIdx.x < 32) { + auto* temp = reinterpret_cast((smem + 72)); + uint16_t data = smem16[threadIdx.x]; + Scan(*temp).ExclusiveSum(data, data); + smem16[threadIdx.x] = data; + } + __syncthreads(); + int32_t tidInGroup = threadIdx.x & 7; + if (bits & (1UL << (tidInGroup * 8))) { + int32_t base = + smem16[group] + __popcll(bits & lowMask(tidInGroup * 8)); + indices[base] = threadIdx.x + start; + } + if (threadIdx.x == 0) { + size = smem16[31] + smem16[32]; + } + __syncthreads(); } template @@ -65,4 +124,164 @@ __device__ inline void blockSum(Getter getter, void* shmem, T* result) { } } +template < + int32_t kBlockSize, + int32_t kItemsPerThread, + typename Key, + typename Value> +using RadixSort = + typename cub::BlockRadixSort; + +template < + int32_t kBlockSize, + int32_t kItemsPerThread, + typename Key, + typename Value> +inline int32_t __host__ __device__ blockSortSharedSize() { + return sizeof( + typename RadixSort::TempStorage); +} + +template < + int32_t kBlockSize, + int32_t kItemsPerThread, + typename Key, + typename Value, + typename KeyGetter, + typename ValueGetter> +void __device__ blockSort( + KeyGetter keyGetter, + ValueGetter valueGetter, + Key* keyOut, + Value* valueOut, + char* smem) { + using Sort = cub::BlockRadixSort; + + // Per-thread tile items + Key keys[kItemsPerThread]; + Value values[kItemsPerThread]; + + // Our current block's offset + int blockOffset = 0; + + // Load items into a blocked arrangement + for (auto i = 0; i < kItemsPerThread; ++i) { + int32_t idx = blockOffset + i * kBlockSize + threadIdx.x; + values[i] = valueGetter(idx); + keys[i] = keyGetter(idx); + } + + __syncthreads(); + auto* temp_storage = reinterpret_cast(smem); + + Sort(*temp_storage).SortBlockedToStriped(keys, values); + + // Store output in striped fashion + cub::StoreDirectStriped( + threadIdx.x, valueOut + blockOffset, values); + 
cub::StoreDirectStriped(threadIdx.x, keyOut + blockOffset, keys); + __syncthreads(); +} + +template +int32_t partitionRowsSharedSize(int32_t numPartitions) { + using Scan = cub::BlockScan; + auto scanSize = sizeof(typename Scan::TempStorage) + sizeof(int32_t); + int32_t counterSize = sizeof(int32_t) * numPartitions; + if (counterSize <= scanSize) { + return scanSize; + } + static_assert( + sizeof(typename Scan::TempStorage) >= sizeof(int32_t) * kBlockSize); + return scanSize + counterSize; // - kBlockSize * sizeof(int32_t); +} + +/// Partitions a sequence of indices into runs where the indices +/// belonging to the same partition are contiguous. Indices from 0 to +/// 'numKeys-1' are partitioned into 'partitionedRows', which must +/// have space for 'numKeys' row numbers. The 0-based partition number +/// for row 'i' is given by 'getter(i)'. The row numbers for +/// partition 0 start at 0. The row numbers for partition i start at +/// 'partitionStarts[i-1]'. There must be at least the amount of +/// shared memory given by partitionSharedSize(numPartitions). +/// 'ranks' is a temporary array of 'numKeys' elements. +template +void __device__ partitionRows( + Getter getter, + uint32_t numKeys, + uint32_t numPartitions, + RowNumber* ranks, + RowNumber* partitionStarts, + RowNumber* partitionedRows) { + using Scan = cub::BlockScan; + constexpr int32_t kWarpThreads = 1 << CUB_LOG_WARP_THREADS(0); + auto warp = threadIdx.x / kWarpThreads; + auto lane = cub::LaneId(); + extern __shared__ __align__(16) char smem[]; + auto* counters = reinterpret_cast( + numPartitions <= kBlockSize ? smem + : smem + + sizeof(typename Scan:: + TempStorage) /*- kBlockSize * sizeof(uint32_t)*/); + for (auto i = threadIdx.x; i < numPartitions; i += kBlockSize) { + counters[i] = 0; + } + __syncthreads(); + for (auto start = 0; start < numKeys; start += kBlockSize) { + int32_t warpStart = start + warp * kWarpThreads; + if (start >= numKeys) { + break; + } + uint32_t laneMask = warpStart + kWarpThreads <= numKeys + ? 0xffffffff + : lowMask(numKeys - warpStart); + if (warpStart + lane < numKeys) { + int32_t key = getter(warpStart + lane); + uint32_t mask = __match_any_sync(laneMask, key); + int32_t leader = (kWarpThreads - 1) - __clz(mask); + uint32_t cnt = __popc(mask & lowMask(lane + 1)); + uint32_t base; + if (lane == leader) { + base = atomicAdd(&counters[key], cnt); + } + base = __shfl_sync(laneMask, base, leader); + ranks[warpStart + lane] = base + cnt - 1; + } + } + // Prefix sum the counts. All counters must have their final value. + __syncthreads(); + auto* temp = reinterpret_cast(smem); + int32_t* aggregate = reinterpret_cast(smem); + for (auto start = 0; start < numPartitions; start += kBlockSize) { + int32_t localCount[1]; + localCount[0] = + threadIdx.x + start < numPartitions ? counters[start + threadIdx.x] : 0; + if (threadIdx.x == 0 && start > 0) { + // The sum of the previous round is carried over as start of this. + localCount[0] += *aggregate; + } + Scan(*temp).InclusiveSum(localCount, localCount); + if (start + threadIdx.x < numPartitions) { + partitionStarts[start + threadIdx.x] = localCount[0]; + } + if (threadIdx.x == kBlockSize - 1 && start + kBlockSize < numPartitions) { + *aggregate = localCount[0]; + } + __syncthreads(); + } + if (threadIdx.x == 0) { + if (partitionStarts[numPartitions - 1] != numKeys) { + *(long*)0 = 0; + } + } + // Write the row numbers of the inputs into the rankth position in each + // partition. 
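To make the scatter step of `partitionRows` concrete before the final loop below, here is a small worked example; the values are derived by tracing the code above and are illustrative only:

```cpp
// Worked example for partitionRows: numKeys = 4, numPartitions = 2,
// getter(i) yields keys {1, 0, 1, 0}.
//   counters        -> {2, 2}        (two rows land in each partition)
//   partitionStarts -> {2, 4}        (inclusive prefix sums of counters)
//   ranks           -> {0, 0, 1, 1}  (arrival order within each partition)
// The scatter below writes partitionedRows[keyStart + ranks[i]] = i, giving
// partitionedRows = {1, 3, 0, 2}: rows 1 and 3 (key 0) occupy positions
// 0..1, rows 0 and 2 (key 1) occupy positions 2..3.
```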
+  for (auto i = threadIdx.x; i < numKeys; i += kBlockSize) {
+    auto key = getter(i);
+    auto keyStart = key == 0 ? 0 : partitionStarts[key - 1];
+    partitionedRows[keyStart + ranks[i]] = i;
+  }
+  __syncthreads();
+}
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/Buffer.h b/velox/experimental/wave/common/Buffer.h
index 57451596a94b7..a205e173ffccf 100644
--- a/velox/experimental/wave/common/Buffer.h
+++ b/velox/experimental/wave/common/Buffer.h
@@ -32,6 +32,8 @@ class GpuArena;
 /// Buffer free list.
 class Buffer {
  public:
+  virtual ~Buffer() = default;
+
   template <typename T>
   T* as() {
     return reinterpret_cast<T*>(ptr_);
@@ -71,9 +73,9 @@
     return referenceCount_;
   }
 
-  void release();
+  virtual void release();
 
- private:
+ protected:
   // Number of WaveBufferPtrs referencing 'this'.
   std::atomic<int32_t> referenceCount_{0};
@@ -108,4 +110,34 @@ static inline void intrusive_ptr_release(Buffer* buffer) {
   buffer->release();
 }
 
+template <typename Releaser>
+class WaveBufferView : public Buffer {
+ public:
+  static WaveBufferPtr create(uint8_t* data, size_t size, Releaser releaser) {
+    WaveBufferView* view = new WaveBufferView(data, size, releaser);
+    WaveBufferPtr result(view);
+    return result;
+  }
+
+  ~WaveBufferView() override = default;
+
+  void release() override {
+    if (referenceCount_.fetch_sub(1) == 1) {
+      // Destructs the releaser, which should release the hold on the
+      // underlying buffer.
+      delete this;
+    }
+  }
+
+ private:
+  WaveBufferView(uint8_t* data, size_t size, Releaser releaser)
+      : Buffer(), releaser_(releaser) {
+    ptr_ = data;
+    size_ = size;
+    capacity_ = size;
+  }
+
+  Releaser const releaser_;
+};
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/Cuda.cu b/velox/experimental/wave/common/Cuda.cu
index 10e716065a244..dd63f28b82439 100644
--- a/velox/experimental/wave/common/Cuda.cu
+++ b/velox/experimental/wave/common/Cuda.cu
@@ -16,10 +16,13 @@
 
 #include <cuda_runtime.h>
 #include <fmt/format.h>
+#include <sstream>
 #include "velox/experimental/wave/common/Cuda.h"
 #include "velox/experimental/wave/common/CudaUtil.cuh"
 #include "velox/experimental/wave/common/Exception.h"
 
+#include <iostream>
+
 namespace facebook::velox::wave {
 
 void cudaCheck(cudaError_t err, const char* file, int line) {
@@ -30,6 +33,16 @@ void cudaCheck(cudaError_t err, const char* file, int line) {
       fmt::format("Cuda error: {}:{} {}", file, line, cudaGetErrorString(err)));
 }
 
+void cudaCheckFatal(cudaError_t err, const char* file, int line) {
+  if (err == cudaSuccess) {
+    return;
+  }
+  auto error =
+      fmt::format("Cuda error: {}:{} {}", file, line, cudaGetErrorString(err));
+  std::cerr << error << std::endl;
+  exit(1);
+}
+
 namespace {
 class CudaManagedAllocator : public GpuAllocator {
  public:
@@ -208,5 +221,67 @@ float Event::elapsedTime(const Event& start) const {
   CUDA_CHECK(cudaEventElapsedTime(&ms, start.event_->event, event_->event));
   return ms;
 }
+namespace {
+struct KernelEntry {
+  const char* name;
+  const void* func;
+};
+
+int32_t numKernelEntries = 0;
+KernelEntry kernelEntries[200];
+} // namespace
+
+bool registerKernel(const char* name, const void* func) {
+  kernelEntries[numKernelEntries].name = name;
+  kernelEntries[numKernelEntries].func = func;
+  ++numKernelEntries;
+  if (numKernelEntries >= sizeof(kernelEntries) / sizeof(kernelEntries[0])) {
+    LOG(ERROR) << "Reserve more space in kernelEntries";
+    exit(1);
+  }
+  return true;
+}
+
+KernelInfo kernelInfo(const void* func) {
+  cudaFuncAttributes attrs;
+  CUDA_CHECK_FATAL(cudaFuncGetAttributes(&attrs, func));
+  KernelInfo info;
+  info.numRegs = attrs.numRegs;
+  info.maxThreadsPerBlock = attrs.maxThreadsPerBlock;
+  info.sharedMemory = attrs.sharedSizeBytes;
+  int max;
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max, func, 256, 0);
+  info.maxOccupancy0 = max;
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max, func, 256, 16);
+  info.maxOccupancy16 = max;
+
+  return info;
+}
+
+std::string KernelInfo::toString() const {
+  std::stringstream out;
+  out << "NumRegs=" << numRegs << " maxThreadsPerBlock=" << maxThreadsPerBlock
+      << " sharedMemory=" << sharedMemory
+      << " occupancy 256,0=" << maxOccupancy0
+      << " occupancy 256,16=" << maxOccupancy16;
+  return out.str();
+}
+
+KernelInfo getRegisteredKernelInfo(const char* name) {
+  for (auto i = 0; i < numKernelEntries; ++i) {
+    if (strcmp(name, kernelEntries[i].name) == 0) {
+      return kernelInfo(kernelEntries[i].func);
+    }
+  }
+  return KernelInfo();
+}
+
+void printKernels() {
+  for (auto i = 0; i < numKernelEntries; ++i) {
+    std::cout << kernelEntries[i].name << " - "
+              << getRegisteredKernelInfo(kernelEntries[i].name).toString()
+              << std::endl;
+  }
+}
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/Cuda.h b/velox/experimental/wave/common/Cuda.h
index b8e11806013d1..cf7900ec2f50a 100644
--- a/velox/experimental/wave/common/Cuda.h
+++ b/velox/experimental/wave/common/Cuda.h
@@ -18,6 +18,8 @@

 #include <cstdint>
 #include <memory>
+#include <string>
+#include <unordered_map>
 /// Contains wrappers for common Cuda objects. Wave does not directly
 /// include Cuda headers because of interference with BitUtils.h and
 /// SimdUtils.h.
@@ -183,4 +185,23 @@ GpuAllocator::UniquePtr GpuAllocator::allocate(size_t n) {
   return UniquePtr(ptr, Deleter(this, bytes));
 }

+/// Info on kernel occupancy limits.
+struct KernelInfo {
+  int32_t numRegs{0};
+  int32_t maxThreadsPerBlock{0};
+  int32_t sharedMemory{0};
+  int32_t maxOccupancy0{0};
+  int32_t maxOccupancy16{0};
+
+  std::string toString() const;
+};
+
+KernelInfo getRegisteredKernelInfo(const char* name);
+
+KernelInfo kernelInfo(const void* func);
+
+std::unordered_map<std::string, const void*>& kernelRegistry();
+
+/// Prints summary of registered kernels.
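+/// For instance, after REGISTER_KERNEL("sum64", sum64) one can write (an
+/// illustrative sketch):
+///
+///   KernelInfo info = getRegisteredKernelInfo("sum64");
+///   std::cout << info.toString() << std::endl; // regs, smem, occupancy
+///   printKernels(); // or dump every registered kernel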
+void printKernels();
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/CudaUtil.cuh b/velox/experimental/wave/common/CudaUtil.cuh
index 07549cfc3f32f..120ce1ec90371 100644
--- a/velox/experimental/wave/common/CudaUtil.cuh
+++ b/velox/experimental/wave/common/CudaUtil.cuh
@@ -25,13 +25,25 @@ namespace facebook::velox::wave {

 void cudaCheck(cudaError_t err, const char* file, int line);

+void cudaCheckFatal(cudaError_t err, const char* file, int line);
+
 #define CUDA_CHECK(e) ::facebook::velox::wave::cudaCheck(e, __FILE__, __LINE__)

+#ifndef CUDA_CHECK_FATAL
+#define CUDA_CHECK_FATAL(e) \
+  ::facebook::velox::wave::cudaCheckFatal(e, __FILE__, __LINE__)
+#endif
+
 template <typename T, typename U>
 __host__ __device__ constexpr inline T roundUp(T value, U factor) {
   return (value + (factor - 1)) / factor * factor;
 }

+template <typename T>
+T __device__ __host__ lowMask(int32_t bits) {
+  return (static_cast<T>(1) << bits) - 1;
+}
+
 __device__ __host__ inline int
 memcmp(const void* lhs, const void* rhs, size_t n) {
   auto* a = reinterpret_cast<const uint8_t*>(lhs);
@@ -44,7 +56,20 @@ memcmp(const void* lhs, const void* rhs, size_t n) {
   return 0;
 }

+inline uint32_t __device__ deviceScale32(uint32_t n, uint32_t scale) {
+  return (static_cast<uint64_t>(n) * scale) >> 32;
+}
+
 struct StreamImpl {
   cudaStream_t stream;
 };
+
+bool registerKernel(const char* name, const void* func);
+
+#define REGISTER_KERNEL(name, func)                               \
+  namespace {                                                     \
+  static bool func##_reg =                                        \
+      registerKernel(name, reinterpret_cast<const void*>(func)); \
+  }
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/FreeSet.cuh b/velox/experimental/wave/common/FreeSet.cuh
new file mode 100644
index 0000000000000..c6e7f2bfc3cf1
--- /dev/null
+++ b/velox/experimental/wave/common/FreeSet.cuh
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace facebook::velox::wave {
+
+/// A fixed-capacity, lock-free set of items, used e.g. to recycle freed rows.
+template <typename T, int32_t kSize>
+class FreeSet {
+ public:
+  static constexpr uint32_t kEmpty = ~0;
+  static constexpr int32_t kBitSizeMask = (kSize / 64) - 1;
+  static constexpr int32_t kSizeMask = kSize - 1;
+
+  void __device__ clear() {
+    for (auto i = threadIdx.x; i < kSize; i += blockDim.x) {
+      if (i < sizeof(bits_) / sizeof(bits_[0])) {
+        bits_[i] = 0;
+      }
+      items_[i] = kEmpty;
+    }
+  }
+
+  // Adds an item. Returns true if it succeeds.
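+  // A sketch of the scheme: each thread starts at a bitmap word picked from
+  // its thread id, claims a free items_ slot with atomicCAS and only then
+  // publishes the slot by setting its bit, wrapping around until all
+  // kSize / 64 words have been tried.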
+ bool __device__ put(T item) { + if (full_) { + return false; + } + auto tid = threadIdx.x + blockDim.x * blockIdx.x; + auto bitIdx = tid & kBitSizeMask; + for (auto count = 0; count <= kBitSizeMask; ++count) { + auto word = ~bits_[bitIdx]; + while (word) { + auto bit = __ffsll(word); + --bit; + if (kEmpty == atomicCAS(&items_[bitIdx * 64 + bit], kEmpty, item)) { + atomicOr(&bits_[bitIdx], 1UL << bit); + if (empty_) { + atomicExch(&empty_, 0); + } + return true; + } + word &= word - 1; + } + bitIdx = bitIdx + 1 & kBitSizeMask; + } + atomicExch(&full_, 1); + return false; + } + + T __device__ get() { + if (empty_) { + return kEmpty; + } + + auto tid = threadIdx.x + blockDim.x * blockIdx.x; + auto bitIdx = tid & kBitSizeMask; + for (auto count = 0; count <= kBitSizeMask; ++count) { + auto word = bits_[bitIdx]; + while (word) { + auto bit = __ffsll(word); + --bit; + T item = atomicExch(&items_[bitIdx * 64 + bit], kEmpty); + if (item != kEmpty) { + atomicAnd(&bits_[bitIdx], ~(1UL << bit)); + if (full_) { + atomicExch(&full_, 0); + } + return item; + } + word &= word - 1; + } + bitIdx = bitIdx + 1 & kBitSizeMask; + } + atomicExch(&empty_, true); + return kEmpty; + } + + int32_t full_{0}; + int32_t empty_{1}; + unsigned long long bits_[kBitSizeMask + 1]; + T items_[kSize]; +}; +} // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/GpuArena.cpp b/velox/experimental/wave/common/GpuArena.cpp index 2a41f8859e0d6..2d2f4610036ce 100644 --- a/velox/experimental/wave/common/GpuArena.cpp +++ b/velox/experimental/wave/common/GpuArena.cpp @@ -272,7 +272,9 @@ std::string GpuSlab::toString() const { } GpuArena::Buffers::Buffers() { - memset(&buffers[0], 0, sizeof(buffers)); + for (auto i = 0; i < sizeof(buffers) / sizeof(buffers[0]); ++i) { + new (&buffers[i]) Buffer(); + } } GpuArena::GpuArena(uint64_t singleArenaCapacity, GpuAllocator* allocator) @@ -297,6 +299,7 @@ WaveBufferPtr GpuArena::getBuffer(void* ptr, size_t size) { result = firstFreeBuffer_; } firstFreeBuffer_ = reinterpret_cast(result->ptr_); + new (result) Buffer(); result->arena_ = this; result->ptr_ = ptr; result->size_ = size; @@ -325,10 +328,9 @@ WaveBufferPtr GpuArena::allocateBytes(uint64_t bytes) { // If first allocation fails we create a new GpuSlab for another attempt. If // it ever fails again then it means requested bytes is larger than a single // GpuSlab's capacity. No further attempts will happen. 
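+  // For example, with a 256MB 'singleArenaCapacity_', a 400MB request gets a
+  // dedicated 400MB slab via max(singleArenaCapacity_, bytes) instead of
+  // failing the retry.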
+ auto arenaBytes = std::max(singleArenaCapacity_, bytes); auto newArena = std::make_shared( - allocator_->allocate(singleArenaCapacity_), - singleArenaCapacity_, - allocator_); + allocator_->allocate(arenaBytes), arenaBytes, allocator_); arenas_.emplace(reinterpret_cast(newArena->address()), newArena); currentArena_ = newArena; result = currentArena_->allocate(bytes); diff --git a/velox/experimental/wave/common/GpuArena.h b/velox/experimental/wave/common/GpuArena.h index 8cb39948139aa..393899d9338cd 100644 --- a/velox/experimental/wave/common/GpuArena.h +++ b/velox/experimental/wave/common/GpuArena.h @@ -124,7 +124,7 @@ class GpuArena { WaveBufferPtr allocateBytes(uint64_t bytes); template - WaveBufferPtr allocate(int32_t items) { + WaveBufferPtr allocate(uint64_t items) { static_assert(std::is_trivially_destructible_v); return allocateBytes(sizeof(T) * items); } diff --git a/velox/experimental/wave/common/Hash.h b/velox/experimental/wave/common/Hash.h index ca3e51e95e265..c91f25407bdca 100644 --- a/velox/experimental/wave/common/Hash.h +++ b/velox/experimental/wave/common/Hash.h @@ -93,6 +93,17 @@ __device__ __host__ inline uint32_t twang32From64(uint64_t key) { return static_cast(key); } +__device__ inline uint64_t hashMix(const uint64_t upper, const uint64_t lower) { + // Murmur-inspired hashing. + const uint64_t kMul = 0x9ddfea08eb382d69ULL; + uint64_t a = (lower ^ upper) * kMul; + a ^= (a >> 47); + uint64_t b = (upper ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; +} + template struct IntHasher32 { __device__ __host__ uint32_t operator()(T val) const { diff --git a/velox/experimental/wave/common/HashTable.cuh b/velox/experimental/wave/common/HashTable.cuh new file mode 100644 index 0000000000000..9ece4aa0cfbce --- /dev/null +++ b/velox/experimental/wave/common/HashTable.cuh @@ -0,0 +1,368 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include "velox/experimental/wave/common/CudaUtil.cuh" +#include "velox/experimental/wave/common/FreeSet.cuh" +#include "velox/experimental/wave/common/Hash.h" +#include "velox/experimental/wave/common/HashTable.h" + +namespace facebook::velox::wave { + +#define GPF() *(long*)0 = 0 + +template +inline __device__ cuda::atomic* asDeviceAtomic( + U* ptr) { + return reinterpret_cast*>(ptr); +} + +template +inline bool __device__ atomicTryLock(T* lock) { + return 0 == + asDeviceAtomic(lock)->exchange(1, cuda::memory_order_consume); +} + +template +inline void __device__ atomicUnlock(T* lock) { + asDeviceAtomic(lock)->store(0, cuda::memory_order_release); +} + +/// Allocator subclass that defines device member functions. 
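+/// Rows are carved from the low end of the partition by atomicAdd on
+/// 'rowOffset'; string space is carved from the high end by atomicSub on
+/// 'stringOffset'. The partition is out of space when the two offsets would
+/// cross. Freed rows are recycled through 'freeSet' before new space is used.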
+struct RowAllocator : public HashPartitionAllocator {
+  template <typename T>
+  T* __device__ allocateRow() {
+    auto fromFree = getFromFree();
+    if (fromFree != kEmpty) {
+      return reinterpret_cast<T*>(base + fromFree);
+    }
+    auto offset = atomicAdd(&rowOffset, rowSize);
+
+    if (offset + rowSize < cub::ThreadLoad<cub::LOAD_CV>(&stringOffset)) {
+      if (!inRange(base + offset)) {
+        GPF();
+      }
+      return reinterpret_cast<T*>(base + offset);
+    }
+    return nullptr;
+  }
+
+  uint32_t __device__ getFromFree() {
+    uint32_t item = reinterpret_cast<FreeSet<uint32_t, 1024>*>(freeSet)->get();
+    if (item != kEmpty) {
+      ++numFromFree;
+    }
+    return item;
+  }
+
+  void __device__ freeRow(void* row) {
+    if (!inRange(row)) {
+      GPF();
+    }
+    uint32_t offset = reinterpret_cast<uint64_t>(row) - base;
+    numFull += reinterpret_cast<FreeSet<uint32_t, 1024>*>(freeSet)->put(
+                   offset) == false;
+  }
+
+  template <typename T>
+  T* __device__ allocate(int32_t cnt) {
+    uint32_t size = sizeof(T) * cnt;
+    auto offset = atomicSub(&stringOffset, size);
+    if (offset - size > cub::ThreadLoad<cub::LOAD_CV>(&rowOffset)) {
+      if (!inRange(base + offset - size)) {
+        GPF();
+      }
+      return reinterpret_cast<T*>(base + offset - size);
+    }
+    return nullptr;
+  }
+
+  template <typename T>
+  bool __device__ inRange(T ptr) {
+    return reinterpret_cast<uint64_t>(ptr) >= base &&
+        reinterpret_cast<uint64_t>(ptr) < base + capacity;
+  }
+};
+
+inline uint8_t __device__ hashTag(uint64_t h) {
+  return 0x80 | (h >> 32);
+}
+
+struct GpuBucket : public GpuBucketMembers {
+  template <typename RowType>
+  inline RowType* __device__ load(int32_t idx) const {
+    uint64_t uptr = reinterpret_cast<const uint32_t*>(&data)[idx];
+    if (uptr == 0) {
+      return nullptr;
+    }
+    uptr |= static_cast<uint64_t>(data[idx + 8]) << 32;
+    return reinterpret_cast<RowType*>(uptr);
+  }
+
+  template <typename RowType>
+  inline RowType* __device__ loadConsume(int32_t idx) {
+    uint64_t uptr =
+        asDeviceAtomic<uint32_t>(&data)[idx].load(cuda::memory_order_consume);
+    if (uptr == 0) {
+      return nullptr;
+    }
+    uptr |= static_cast<uint64_t>(data[idx + 8]) << 32;
+    return reinterpret_cast<RowType*>(uptr);
+  }
+
+  template <typename RowType>
+  inline RowType* __device__ loadWithWait(int32_t idx) {
+    RowType* hit;
+    do {
+      // It could be somebody inserted the tag but did not fill in the
+      // pointer. The pointer is coming in a few clocks.
+      hit = loadConsume<RowType>(idx);
+    } while (!hit);
+    return hit;
+  }
+
+  inline void __device__ store(int32_t idx, void* ptr) {
+    auto uptr = reinterpret_cast<uint64_t>(ptr);
+    data[8 + idx] = uptr >> 32;
+    // The high part must be seen if the low part is seen.
+    asDeviceAtomic<uint32_t>(&data)[idx].store(
+        uptr, cuda::memory_order_release);
+  }
+
+  bool __device__ addNewTag(uint8_t tag, uint32_t oldTags, uint8_t tagShift) {
+    uint32_t newTags = oldTags | (static_cast<uint32_t>(tag) << tagShift);
+    return (oldTags == atomicCAS(&tags, oldTags, newTags));
+  }
+};
+
+/// Shared memory state for an updating probe.
+struct ProbeShared {
+  int32_t* inputRetries;
+  int32_t* outputRetries;
+  uint32_t numKernelRetries;
+  uint32_t numHostRetries;
+  int32_t blockBase;
+  int32_t blockEnd;
+  int32_t numRounds;
+  int32_t toDo;
+  int32_t done;
+  int32_t numUpdated;
+  int32_t numTried;
+
+  /// Initializes a probe. Sets outputRetries and clears inputRetries and other
+  /// state.
+  void __device__ init(HashProbe* probe, int32_t base) {
+    inputRetries = nullptr;
+    outputRetries = probe->kernelRetries1;
+    numKernelRetries = 0;
+    numHostRetries = 0;
+    blockBase = base;
+    toDo = 0;
+    done = 0;
+    numRounds = 0;
+  }
+
+  // Resets the retry count and swaps input and output retries.
+  void __device__ nextRound(HashProbe* probe) {
+    numKernelRetries = 0;
+    if (!inputRetries) {
+      // This is after the initial round where there are no input retries.
+ inputRetries = outputRetries; + outputRetries = probe->kernelRetries2; + } else { + // swap input and output retries. + auto temp = outputRetries; + outputRetries = inputRetries; + inputRetries = temp; + } + } +}; + +class GpuHashTable : public GpuHashTableBase { + public: + static constexpr int32_t kExclusive = 1; + + static int32_t updatingProbeSharedSize() { + return sizeof(ProbeShared); + } + + template + void __device__ readOnlyProbe(HashProbe* probe, Ops ops) { + int32_t blockBase = ops.blockBase(probe); + int32_t end = ops.numRowsInBlock(probe) + blockBase; + for (auto i = blockBase + threadIdx.x; i < end; i += blockDim.x) { + auto h = ops.hash(i, probe); + uint32_t tagWord = hashTag(h); + tagWord |= tagWord << 8; + tagWord = tagWord | tagWord << 16; + auto bucketIdx = h & sizeMask; + for (;;) { + GpuBucket* bucket = buckets + bucketIdx; + auto tags = bucket->tags; + auto hits = __vcmpeq4(tags, tagWord) & 0x01010101; + while (hits) { + auto hitIdx = (__ffs(hits) - 1) / 8; + auto* hit = bucket->load(hitIdx); + if (ops.compare(this, hit, i, probe)) { + ops.hit(i, probe, hit); + goto done; + } + hits = hits & (hits - 1); + } + if (__vcmpeq4(tags, 0)) { + ops.miss(i, probe); + break; + } + bucketIdx = (bucketIdx + 1) & sizeMask; + } + done:; + } + } + + template + void __device__ updatingProbe(HashProbe* probe, Ops ops) { + extern __shared__ __align__(16) char smem[]; + auto* sharedState = reinterpret_cast(smem); + if (threadIdx.x == 0) { + sharedState->init(probe, ops.blockBase(probe)); + } + __syncthreads(); + auto lane = cub::LaneId(); + constexpr int32_t kWarpThreads = 1 << CUB_LOG_WARP_THREADS(0); + auto warp = threadIdx.x / kWarpThreads; + int32_t end = ops.numRowsInBlock(probe) + sharedState->blockBase; + for (auto i = threadIdx.x + sharedState->blockBase; i < end; + i += blockDim.x) { + auto start = i & ~(kWarpThreads - 1); + uint32_t laneMask = + start + kWarpThreads <= end ? ~0 : lowMask(end - start); + auto h = ops.hash(i, probe); + uint32_t tagWord = hashTag(h); + tagWord |= tagWord << 8; + tagWord = tagWord | tagWord << 16; + auto bucketIdx = h & sizeMask; + uint32_t misses = 0; + RowType* hit = nullptr; + RowType* toInsert = nullptr; + int32_t hitIdx; + GpuBucket* bucket; + uint32_t tags; + for (;;) { + bucket = buckets + bucketIdx; + reprobe: + tags = asDeviceAtomic(&bucket->tags) + ->load(cuda::memory_order_consume); + auto hits = __vcmpeq4(tags, tagWord) & 0x01010101; + while (hits) { + hitIdx = (__ffs(hits) - 1) / 8; + auto candidate = bucket->loadWithWait(hitIdx); + if (ops.compare(this, candidate, i, probe)) { + if (toInsert) { + freeInsertable(toInsert, h); + } + hit = candidate; + break; + } + hits = hits & (hits - 1); + } + if (hit) { + break; + } + misses = __vcmpeq4(tags, 0); + if (misses) { + auto success = ops.insert( + this, + partitionIdx(h), + bucket, + misses, + tags, + tagWord, + i, + probe, + toInsert); + if (success == ProbeState::kRetry) { + goto reprobe; + } + if (success == ProbeState::kNeedSpace) { + addHostRetry(sharedState, i, probe); + } + hit = toInsert; + break; + } + bucketIdx = (bucketIdx + 1) & sizeMask; + } + // Every lane has a hit, or a nullptr if out of space. 
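+      // The lanes that found the same row now elect a leader; the leader
+      // alone acquires the row and applies its peers' updates one by one, so
+      // a hot group is updated once per row rather than once per lane.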
+      uint32_t peers =
+          __match_any_sync(laneMask, reinterpret_cast<uint64_t>(hit));
+      if (hit) {
+        int32_t leader = (kWarpThreads - 1) - __clz(peers);
+        RowType* writable = nullptr;
+        if (lane == leader) {
+          writable = ops.getExclusive(this, bucket, hit, hitIdx, warp);
+        }
+        auto toUpdate = peers;
+        ProbeState success = ProbeState::kDone;
+        while (toUpdate) {
+          auto peer = __ffs(toUpdate) - 1;
+          auto idxToUpdate = __shfl_sync(peers, i, peer);
+          if (lane == leader) {
+            if (success == ProbeState::kDone) {
+              success = ops.update(this, bucket, writable, idxToUpdate, probe);
+            }
+            if (success == ProbeState::kNeedSpace) {
+              addHostRetry(sharedState, idxToUpdate, probe);
+            }
+            if (success != ProbeState::kDone) {
+              printf(""); // Debug hook: unexpected update state.
+            }
+          }
+          toUpdate &= toUpdate - 1;
+        }
+        if (lane == leader) {
+          ops.writeDone(writable);
+        }
+      } else {
+        printf(""); // Debug hook: out of space, the row was not inserted.
+      }
+    }
+  }
+
+  template <typename RowType>
+  void __device__ freeInsertable(RowType*& row, uint64_t h) {
+    allocators[partitionIdx(h)].freeRow(row);
+    row = nullptr;
+  }
+
+  int32_t __device__ partitionIdx(uint64_t h) const {
+    return (h & partitionMask) >> partitionShift;
+  }
+
+ private:
+  static void __device__
+  addHostRetry(ProbeShared* shared, int32_t i, HashProbe* probe) {
+    probe->hostRetries
+        [shared->blockBase + atomicAdd(&shared->numHostRetries, 1)] = i;
+  }
+};
+} // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/HashTable.h b/velox/experimental/wave/common/HashTable.h
new file mode 100644
index 0000000000000..54dec795bb3d2
--- /dev/null
+++ b/velox/experimental/wave/common/HashTable.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+/// Structs for tagged GPU hash table. Can be included in both Velox .cpp and
+/// .cu.
+namespace facebook::velox::wave {
+
+/// A 32 byte tagged bucket with 4 tags, 4 flag bytes and 4 6-byte
+/// pointers. Fits in one 32 byte GPU cache sector.
+struct GpuBucketMembers {
+  uint32_t tags;
+  uint32_t flags;
+  uint16_t data[12];
+
+  template <typename T>
+  T* testingLoad(int32_t idx) {
+    auto uptr = static_cast<uint64_t>(data[8 + idx]) << 32;
+    uptr |= reinterpret_cast<uint32_t*>(data)[idx];
+    return reinterpret_cast<T*>(uptr);
+  }
+};
+
+template <typename T, int32_t kSize>
+class FreeSetBase {
+  int32_t full_{0};
+  int32_t empty_{1};
+  unsigned long long bits_[kSize / 64] = {};
+  T items_[kSize] = {};
+};
+
+/// A device arena for device side allocation.
+struct HashPartitionAllocator {
+  static constexpr uint32_t kEmpty = ~0;
+
+  HashPartitionAllocator(
+      char* data,
+      uint32_t size,
+      uint32_t rowSize,
+      void* freeSet)
+      : rowSize(rowSize),
+        base(reinterpret_cast<uint64_t>(data)),
+        capacity(size),
+        stringOffset(capacity),
+        freeSet(freeSet) {}
+
+  const int32_t rowSize{0};
+  const uint64_t base{0};
+  uint32_t rowOffset{0};
+  const uint32_t capacity{0};
+  uint32_t stringOffset{0};
+  void* freeSet{nullptr};
+  int32_t numFromFree{0};
+  int32_t numFull{0};
+};
+
+/// Implementation of HashPartitionAllocator, defined in .cuh.
+struct RowAllocator; + +enum class ProbeState : uint8_t { kDone, kMoreValues, kNeedSpace, kRetry }; + +/// Operands for one TB of hash probe. +struct HashProbe { + /// The number of input rows processed by each thread of a TB. The base index + /// for a block in the arrays in 'this' is 'numRowsPerThread * blockDim.x * + /// blockIdx.x' + int32_t numRowsPerThread{1}; + + /// Count of probe keys for each TB. Subscript is blockIdx.x. + int32_t* numRows; + + /// Data for probe keys. To be interpreted by Ops of the probe, no + /// fixed format. + void* keys; + + /// Hash numbers for probe keys. + uint64_t* hashes; + + /// List of input rows to retry in kernel. Sized to one per row of + /// input. Used inside kernel, not meaningful after return. Sample + /// use case is another warp updating the same row. + int32_t* kernelRetries1; + int32_t* kernelRetries2; + + /// List of input rows to retry after host updated state. Sized to + /// one per row of input. The reason for a host side retry is + /// needing more space. The host will decide to allocate/spill/error + /// out. + int32_t* hostRetries; + + /// Count of valid items in 'hostRetries'. The subscript is blockIdx.x. + int32_t* numHostRetries; + + /// Space in 'hits' and 'hitRows'. Should be a multiple of probe block width. + int32_t maxHits{0}; + + /// Row numbers for hits. Indices into 'hashes'. + int32_t* hitRows{nullptr}; + + // Optional payload rows hitting from a probe. + void** hits{nullptr}; +}; + +struct GpuBucket; + +struct GpuHashTableBase { + /// Bucket array. Size is 'sizeMask + 1'. + GpuBucket* buckets{nullptr}; + + // Mask to extract index into 'buckets' from a hash number. a + // sizemask of 63 means 64 buckets, which is up to 256 entries. + uint32_t sizeMask; + + // Translates a hash number to a partition number '(hash & + // partitionMask) >> partitionShift' is a partition number used as + // a physical partition of the table. Used as index into 'allocators'. + uint32_t partitionMask{0}; + uint8_t partitionShift{0}; + + /// A RowAllocator for each partition. + RowAllocator* allocators; +}; + +} // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/BlockTest.cpp b/velox/experimental/wave/common/tests/BlockTest.cpp index 012010233e71a..b5b543c450330 100644 --- a/velox/experimental/wave/common/tests/BlockTest.cpp +++ b/velox/experimental/wave/common/tests/BlockTest.cpp @@ -27,6 +27,15 @@ using namespace facebook::velox; using namespace facebook::velox::wave; +constexpr int32_t kNumPartitionBlocks = 100; +struct PartitionRun { + uint16_t* keys[kNumPartitionBlocks]; + int32_t numRows[kNumPartitionBlocks]; + int32_t* ranks[kNumPartitionBlocks]; + int32_t* partitionStarts[kNumPartitionBlocks]; + int32_t* partitionedRows[kNumPartitionBlocks]; +}; + class BlockTest : public testing::Test { protected: void SetUp() override { @@ -39,77 +48,301 @@ class BlockTest : public testing::Test { void prefetch(Stream& stream, WaveBufferPtr buffer) { stream.prefetch(device_, buffer->as(), buffer->capacity()); } + void testBoolToIndices(bool use256) { + /// We make a set of 256 flags and corresponding 256 indices of true flags. 
+ constexpr int32_t kNumBlocks = 20480; + constexpr int32_t kBlockSize = 256; + constexpr int32_t kNumFlags = kBlockSize * kNumBlocks; + auto flagsBuffer = arena_->allocate(kNumFlags); + auto indicesBuffer = arena_->allocate(kNumFlags); + auto sizesBuffer = arena_->allocate(kNumBlocks); + BlockTestStream stream; + + std::vector referenceIndices(kNumFlags); + std::vector referenceSizes(kNumBlocks); + uint8_t* flags = flagsBuffer->as(); + for (auto i = 0ul; i < kNumFlags; ++i) { + if ((i >> 8) % 17 == 0) { + flags[i] = 0; + } else if ((i >> 8) % 23 == 0) { + flags[i] = 1; + } else { + flags[i] = (i * 1121) % 73 > 50; + } + } + for (auto b = 0; b < kNumBlocks; ++b) { + auto start = b * kBlockSize; + int32_t counter = start; + for (auto i = 0; i < kBlockSize; ++i) { + if (flags[start + i]) { + referenceIndices[counter++] = start + i; + } + } + referenceSizes[b] = counter - start; + } + + prefetch(stream, flagsBuffer); + prefetch(stream, indicesBuffer); + prefetch(stream, sizesBuffer); + stream.wait(); + auto indicesPointers = arena_->allocate(kNumBlocks); + auto flagsPointers = arena_->allocate(kNumBlocks); + for (auto i = 0; i < kNumBlocks; ++i) { + flagsPointers->as()[i] = flags + (i * kBlockSize); + indicesPointers->as()[i] = + indicesBuffer->as() + (i * kBlockSize); + } + + prefetch(stream, flagsBuffer); + prefetch(stream, indicesBuffer); + prefetch(stream, sizesBuffer); + stream.wait(); + auto startMicros = getCurrentTimeMicro(); + if (use256) { + stream.testBool256ToIndices( + kNumBlocks, + flagsPointers->as(), + indicesPointers->as(), + sizesBuffer->as()); + + } else { + stream.testBoolToIndices( + kNumBlocks, + flagsPointers->as(), + indicesPointers->as(), + sizesBuffer->as()); + } + stream.wait(); + auto elapsed = getCurrentTimeMicro() - startMicros; + for (auto b = 0; b < kNumBlocks; ++b) { + auto* reference = referenceIndices.data() + b * kBlockSize; + auto* actual = indicesBuffer->as() + b * kBlockSize; + auto* referenceSizesData = referenceSizes.data(); + auto* actualSizes = sizesBuffer->as(); + ASSERT_EQ( + 0, ::memcmp(reference, actual, referenceSizes[b] * sizeof(int32_t))); + ASSERT_EQ(referenceSizesData[b], actualSizes[b]); + } + std::cout << "Flags " << (use256 ? "256" : "") << " to indices: " << elapsed + << "us, " << kNumFlags / static_cast(elapsed) << " Mrows/s" + << std::endl; + + auto temp = arena_->allocate( + BlockTestStream::boolToIndicesSize() * kNumBlocks); + prefetch(stream, temp); + prefetch(stream, flagsBuffer); + prefetch(stream, indicesBuffer); + prefetch(stream, sizesBuffer); + stream.wait(); + + startMicros = getCurrentTimeMicro(); + if (use256) { + stream.testBool256ToIndicesNoShared( + kNumBlocks, + flagsPointers->as(), + indicesPointers->as(), + sizesBuffer->as(), + temp->as()); + } else { + stream.testBoolToIndicesNoShared( + kNumBlocks, + flagsPointers->as(), + indicesPointers->as(), + sizesBuffer->as(), + temp->as()); + } + stream.wait(); + elapsed = getCurrentTimeMicro() - startMicros; + std::cout << "Flags " << (use256 ? 
"256" : "") + << " to indices: " << " to indices no smem: " << elapsed << "us, " + << kNumFlags / static_cast(elapsed) << " Mrows/s" + << std::endl; + } + + void makePartitionRun( + int32_t numRows, + int32_t numPartitions, + PartitionRun*& run, + WaveBufferPtr& buffer) { + auto rowsRounded = bits::roundUp(numRows, 8); + auto partitionsRounded = bits::roundUp(numPartitions, 8); + int64_t bytes = sizeof(PartitionRun) + + kNumPartitionBlocks * + (rowsRounded * sizeof(int32_t) * 4 + + partitionsRounded * sizeof(int32_t)); + if (!buffer || buffer->capacity() < bytes) { + buffer = arena_->allocate(bytes); + } + run = buffer->as(); + auto chars = buffer->as() + sizeof(PartitionRun); + for (auto block = 0; block < kNumPartitionBlocks; ++block) { + run->keys[block] = reinterpret_cast(chars); + run->numRows[block] = numRows; + chars += rowsRounded * sizeof(uint16_t); + run->partitionStarts[block] = reinterpret_cast(chars); + chars += numPartitions * sizeof(int32_t); + run->ranks[block] = reinterpret_cast(chars); + chars += sizeof(int32_t) * numRows; + run->partitionedRows[block] = reinterpret_cast(chars); + chars += sizeof(int32_t) * numRows; + for (auto i = 0; i < numRows; ++i) { + run->keys[block][i] = (block + i * 2017) % numPartitions; + } + } + VELOX_CHECK_LE(chars - buffer->as(), bytes); + } + void checkPartitionRun(const PartitionRun& run, int32_t numPartitions) { + // Check that every row is once in its proper partition. + for (auto block = 0; block < kNumPartitionBlocks; ++block) { + std::vector flags(run.numRows[block], false); + for (auto part = 0; part < numPartitions; ++part) { + for (auto i = (part == 0 ? 0 : run.partitionStarts[block][part - 1]); + i < run.partitionStarts[block][part]; + ++i) { + auto row = run.partitionedRows[block][i]; + EXPECT_LT(row, run.numRows[block]); + EXPECT_FALSE(flags[row]); + EXPECT_EQ(part, run.keys[block][row]); + flags[row] = true; + } + } + // Expect that all flags are set. + for (auto i = 0; i < run.numRows[block]; ++i) { + EXPECT_TRUE(flags[i]); + } + } + } Device* device_; GpuAllocator* allocator_; std::unique_ptr arena_; }; TEST_F(BlockTest, boolToIndices) { - /// We make a set of 256 flags and corresponding 256 indices of true flags. - constexpr int32_t kNumBlocks = 20480; - constexpr int32_t kBlockSize = 256; - constexpr int32_t kNumFlags = kBlockSize * kNumBlocks; - auto flagsBuffer = arena_->allocate(kNumFlags); - auto indicesBuffer = arena_->allocate(kNumFlags); - auto sizesBuffer = arena_->allocate(kNumBlocks); - auto timesBuffer = arena_->allocate(kNumBlocks); + testBoolToIndices(false); + testBoolToIndices(true); +} + +TEST_F(BlockTest, shortRadixSort) { + // We make a set of 8K uint16_t keys and uint16_t values. 
+ constexpr int32_t kNumBlocks = 1024; + constexpr int32_t kBlockSize = 1024; + constexpr int32_t kValuesPerThread = 8; + constexpr int32_t kValuesPerBlock = kBlockSize * kValuesPerThread; + constexpr int32_t kNumValues = kBlockSize * kNumBlocks * kValuesPerThread; + auto keysBuffer = arena_->allocate(kNumValues); + auto valuesBuffer = arena_->allocate(kNumValues); BlockTestStream stream; - std::vector referenceIndices(kNumFlags); - std::vector referenceSizes(kNumBlocks); - uint8_t* flags = flagsBuffer->as(); - for (auto i = 0ul; i < kNumFlags; ++i) { - if ((i >> 8) % 17 == 0) { - flags[i] = 0; - } else if ((i >> 8) % 23 == 0) { - flags[i] = 1; - } else { - flags[i] = (i * 1121) % 73 > 50; - } + std::vector referenceKeys(kNumValues); + std::vector referenceValues(kNumValues); + uint16_t* keys = keysBuffer->as(); + uint16_t* values = valuesBuffer->as(); + for (auto i = 0; i < kNumValues; ++i) { + keys[i] = i * 2017; + values[i] = i; } + for (auto b = 0; b < kNumBlocks; ++b) { - auto start = b * kBlockSize; - int32_t counter = start; - for (auto i = 0; i < kBlockSize; ++i) { - if (flags[start + i]) { - referenceIndices[counter++] = start + i; - } + auto start = b * kValuesPerBlock; + std::vector indices(kValuesPerBlock); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [&](auto left, auto right) { + return keys[start + left] < keys[start + right]; + }); + for (auto i = 0; i < kValuesPerBlock; ++i) { + referenceValues[start + i] = values[start + indices[i]]; } - referenceSizes[b] = counter - start; } - prefetch(stream, flagsBuffer); - prefetch(stream, indicesBuffer); - prefetch(stream, sizesBuffer); + prefetch(stream, valuesBuffer); + prefetch(stream, keysBuffer); - auto indicesPointers = arena_->allocate(kNumBlocks); - auto flagsPointers = arena_->allocate(kNumBlocks); + auto keysPointers = arena_->allocate(kNumBlocks); + auto valuesPointers = arena_->allocate(kNumBlocks); for (auto i = 0; i < kNumBlocks; ++i) { - flagsPointers->as()[i] = flags + (i * kBlockSize); - indicesPointers->as()[i] = - indicesBuffer->as() + (i * kBlockSize); + keysPointers->as()[i] = keys + (i * kValuesPerBlock); + valuesPointers->as()[i] = + valuesBuffer->as() + (i * kValuesPerBlock); } - + auto keySegments = keysPointers->as(); + auto valueSegments = valuesPointers->as(); + prefetch(stream, keysPointers); + prefetch(stream, valuesPointers); + stream.wait(); auto startMicros = getCurrentTimeMicro(); - stream.testBoolToIndices( - kNumBlocks, - flagsPointers->as(), - indicesPointers->as(), - sizesBuffer->as(), - timesBuffer->as()); + stream.testSort16(kNumBlocks, keySegments, valueSegments); stream.wait(); auto elapsed = getCurrentTimeMicro() - startMicros; for (auto b = 0; b < kNumBlocks; ++b) { ASSERT_EQ( 0, ::memcmp( - referenceIndices.data() + b * kBlockSize, - indicesBuffer->as() + b * kBlockSize, - referenceSizes[b] * sizeof(int32_t))); - ASSERT_EQ(referenceSizes[b], sizesBuffer->as()[b]); + referenceValues.data() + b * kValuesPerBlock, + valueSegments[b], + kValuesPerBlock * sizeof(uint16_t))); + } + std::cout << "sort16: " << elapsed << "us, " + << kNumValues / static_cast(elapsed) << " Mrows/s" + << std::endl; + + // Reset the test values for second test. 
+  for (auto i = 0; i < kNumValues; ++i) {
+    keys[i] = i * 2017;
+    values[i] = i;
+  }
+  auto temp =
+      arena_->allocate<char>(kNumBlocks * BlockTestStream::sort16SharedSize());
+  prefetch(stream, temp);
+  prefetch(stream, valuesBuffer);
+  prefetch(stream, keysBuffer);
+  prefetch(stream, keysPointers);
+  prefetch(stream, valuesPointers);
+  stream.wait();
+  startMicros = getCurrentTimeMicro();
+  stream.testSort16NoShared(
+      kNumBlocks, keySegments, valueSegments, temp->as<char>());
+  stream.wait();
+  elapsed = getCurrentTimeMicro() - startMicros;
+  std::cout << "sort16 no shared: " << elapsed << "us, "
+            << kNumValues / static_cast<float>(elapsed) << " Mrows/s"
+            << std::endl;
+}
+
+TEST_F(BlockTest, partition) {
+  // We make several sets of keys and temp and result buffers. These
+  // are in unified memory. We run the partition for all and check the
+  // outcome on the host. We run at several different partition counts
+  // and batch sizes. All experiments are submitted as kNumPartitionBlocks
+  // concurrent thread blocks of 256 threads.
+  BlockTestStream stream;
+  std::vector<int32_t> partitionCounts = {1, 2, 32, 333, 1000, 8000};
+  std::vector<int32_t> runSizes = {100, 1000, 10000, 30000};
+  WaveBufferPtr buffer;
+  PartitionRun* run;
+  for (auto parts : partitionCounts) {
+    for (auto rows : runSizes) {
+      makePartitionRun(rows, parts, run, buffer);
+      prefetch(stream, buffer);
+      auto startMicros = getCurrentTimeMicro();
+      stream.partitionShorts(
+          kNumPartitionBlocks,
+          run->keys,
+          run->numRows,
+          parts,
+          run->ranks,
+          run->partitionStarts,
+          run->partitionedRows);
+      stream.wait();
+      auto time = getCurrentTimeMicro() - startMicros;
+      std::cout << fmt::format(
+                       "Partition {} batch={} fanout={} rate={} Mrows/s",
+                       kNumPartitionBlocks,
+                       rows,
+                       parts,
+                       kNumPartitionBlocks * static_cast<float>(rows) / time)
+                << std::endl;
+      checkPartitionRun(*run, parts);
+    }
+  }
+}
diff --git a/velox/experimental/wave/common/tests/BlockTest.cu b/velox/experimental/wave/common/tests/BlockTest.cu
index 6b32b9880f125..695b075b9a2bc 100644
--- a/velox/experimental/wave/common/tests/BlockTest.cu
+++ b/velox/experimental/wave/common/tests/BlockTest.cu
@@ -1,20 +1,35 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + #include "velox/experimental/wave/common/Block.cuh" #include "velox/experimental/wave/common/CudaUtil.cuh" +#include "velox/experimental/wave/common/HashTable.cuh" #include "velox/experimental/wave/common/tests/BlockTest.h" +#include "velox/experimental/wave/common/tests/HashTestUtil.h" +#include "velox/experimental/wave/common/tests/Updates.cuh" namespace facebook::velox::wave { using ScanAlgorithm = cub::BlockScan; -__global__ void boolToIndices( - uint8_t** bools, - int32_t** indices, - int32_t* sizes, - int64_t* times) { +__global__ void +boolToIndicesKernel(uint8_t** bools, int32_t** indices, int32_t* sizes) { extern __shared__ char smem[]; int32_t idx = blockIdx.x; // Start cycle timer - clock_t start = clock(); uint8_t* blockBools = bools[idx]; boolBlockToIndices<256>( [&]() { return blockBools[threadIdx.x]; }, @@ -22,25 +37,111 @@ __global__ void boolToIndices( indices[idx], smem, sizes[idx]); - clock_t stop = clock(); - if (threadIdx.x == 0) { - times[idx] = (start > stop) ? start - stop : stop - start; - } } void BlockTestStream::testBoolToIndices( int32_t numBlocks, uint8_t** flags, int32_t** indices, - int32_t* sizes, - int64_t* times) { + int32_t* sizes) { CUDA_CHECK(cudaGetLastError()); auto tempBytes = sizeof(typename ScanAlgorithm::TempStorage); - boolToIndices<<stream>>>( - flags, indices, sizes, times); + boolToIndicesKernel<<stream>>>( + flags, indices, sizes); + CUDA_CHECK(cudaGetLastError()); +} + +__global__ void boolToIndicesNoSharedKernel( + uint8_t** bools, + int32_t** indices, + int32_t* sizes, + void* temp) { + int32_t idx = blockIdx.x; + + uint8_t* blockBools = bools[idx]; + char* smem = reinterpret_cast(temp) + + blockIdx.x * sizeof(typename ScanAlgorithm::TempStorage); + boolBlockToIndices<256>( + [&]() { return blockBools[threadIdx.x]; }, + idx * 256, + indices[idx], + smem, + sizes[idx]); +} + +void BlockTestStream::testBoolToIndicesNoShared( + int32_t numBlocks, + uint8_t** flags, + int32_t** indices, + int32_t* sizes, + void* temp) { + CUDA_CHECK(cudaGetLastError()); + boolToIndicesNoSharedKernel<<stream>>>( + flags, indices, sizes, temp); CUDA_CHECK(cudaGetLastError()); } +int32_t BlockTestStream::boolToIndicesSize() { + return sizeof(typename ScanAlgorithm::TempStorage); +} + +__global__ void +bool256ToIndicesKernel(uint8_t** bools, int32_t** indices, int32_t* sizes) { + extern __shared__ char smem[]; + int32_t idx = blockIdx.x; + auto* bool64 = reinterpret_cast(bools[idx]); + bool256ToIndices( + [&](int32_t index8) { return bool64[index8]; }, + idx * 256, + indices[idx], + sizes[idx], + smem); +} + +void BlockTestStream::testBool256ToIndices( + int32_t numBlocks, + uint8_t** flags, + int32_t** indices, + int32_t* sizes) { + CUDA_CHECK(cudaGetLastError()); + auto tempBytes = bool256ToIndicesSize(); + bool256ToIndicesKernel<<stream>>>( + flags, indices, sizes); + CUDA_CHECK(cudaGetLastError()); +} + +__global__ void bool256ToIndicesNoSharedKernel( + uint8_t** bools, + int32_t** indices, + int32_t* sizes, + void* temp) { + int32_t idx = blockIdx.x; + auto* bool64 = reinterpret_cast(bools[idx]); + char* smem = reinterpret_cast(temp) + blockIdx.x * 80; + bool256ToIndices( + [&](int32_t index8) { return bool64[index8]; }, + idx * 256, + indices[idx], + sizes[idx], + smem); +} + +void BlockTestStream::testBool256ToIndicesNoShared( + int32_t numBlocks, + uint8_t** flags, + int32_t** indices, + int32_t* sizes, + void* temp) { + CUDA_CHECK(cudaGetLastError()); + bool256ToIndicesNoSharedKernel<<stream>>>( + flags, indices, sizes, temp); + 
+  CUDA_CHECK(cudaGetLastError());
+}
+
+int32_t BlockTestStream::bool256ToIndicesSize() {
+  return 80;
+}
+
 __global__ void sum64(int64_t* numbers, int64_t* results) {
   extern __shared__ char smem[];
   int32_t idx = blockIdx.x;
@@ -57,4 +158,417 @@ void BlockTestStream::testSum64(
   CUDA_CHECK(cudaGetLastError());
 }

+/// Keys and values are n sections of 8K items. The items in each section get
+/// sorted on the key.
+void __global__ __launch_bounds__(1024)
+    testSort(uint16_t** keys, uint16_t** values) {
+  extern __shared__ __align__(16) char smem[];
+  auto keyBase = keys[blockIdx.x];
+  auto valueBase = values[blockIdx.x];
+  blockSort<256, 32>(
+      [&](auto i) { return keyBase[i]; },
+      [&](auto i) { return valueBase[i]; },
+      keys[blockIdx.x],
+      values[blockIdx.x],
+      smem);
+}
+
+void __global__ __launch_bounds__(1024)
+    testSortNoShared(uint16_t** keys, uint16_t** values, char* smem) {
+  auto keyBase = keys[blockIdx.x];
+  auto valueBase = values[blockIdx.x];
+  char* tbTemp = smem +
+      blockIdx.x *
+          sizeof(typename cub::BlockRadixSort<uint16_t, 256, 32, uint16_t>::
+                     TempStorage);
+
+  blockSort<256, 32>(
+      [&](auto i) { return keyBase[i]; },
+      [&](auto i) { return valueBase[i]; },
+      keys[blockIdx.x],
+      values[blockIdx.x],
+      tbTemp);
+}
+
+int32_t BlockTestStream::sort16SharedSize() {
+  return sizeof(
+      typename cub::BlockRadixSort<uint16_t, 256, 32, uint16_t>::TempStorage);
+}
+
+void BlockTestStream::testSort16(
+    int32_t numBlocks,
+    uint16_t** keys,
+    uint16_t** values) {
+  auto tempBytes = sizeof(
+      typename cub::BlockRadixSort<uint16_t, 256, 32, uint16_t>::TempStorage);
+
+  testSort<<<numBlocks, 256, tempBytes, stream_->stream>>>(keys, values);
+}
+
+void BlockTestStream::testSort16NoShared(
+    int32_t numBlocks,
+    uint16_t** keys,
+    uint16_t** values,
+    char* temp) {
+  testSortNoShared<<<numBlocks, 256, 0, stream_->stream>>>(keys, values, temp);
+}
+
+/// Calls partitionRows on each thread block of 256 threads. The parameters
+/// correspond to 'partitionRows'. Each is an array subscripted by blockIdx.x.
+void __global__ partitionShortsKernel(
+    uint16_t** keys,
+    int32_t* numKeys,
+    int32_t numPartitions,
+    int32_t** ranks,
+    int32_t** partitionStarts,
+    int32_t** partitionedRows) {
+  partitionRows<256>(
+      [&](auto i) { return keys[blockIdx.x][i]; },
+      numKeys[blockIdx.x],
+      numPartitions,
+      ranks[blockIdx.x],
+      partitionStarts[blockIdx.x],
+      partitionedRows[blockIdx.x]);
+}
+
+void BlockTestStream::partitionShorts(
+    int32_t numBlocks,
+    uint16_t** keys,
+    int32_t* numKeys,
+    int32_t numPartitions,
+    int32_t** ranks,
+    int32_t** partitionStarts,
+    int32_t** partitionedRows) {
+  constexpr int32_t kBlockSize = 256;
+  auto shared = partitionRowsSharedSize<kBlockSize>(numPartitions);
+  partitionShortsKernel<<<numBlocks, kBlockSize, shared, stream_->stream>>>(
+      keys, numKeys, numPartitions, ranks, partitionStarts, partitionedRows);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/// A mock complex accumulator update function.
+ProbeState __device__ arrayAgg64Append(
+    ArrayAgg64* accumulator,
+    int64_t arg,
+    RowAllocator* allocator) {
+  auto* last = accumulator->last;
+  if (!last || accumulator->numInLast >= sizeof(last->data) / sizeof(int64_t)) {
+    auto* next = allocator->allocate<ArrayAgg64::Run>(1);
+    if (!next) {
+      return ProbeState::kNeedSpace;
+    }
+    next->next = nullptr;
+    if (accumulator->last) {
+      accumulator->last->next = next;
+      accumulator->last = next;
+    } else {
+      accumulator->first = accumulator->last = next;
+    }
+    // Start filling the newly added run.
+    accumulator->numInLast = 0;
+  }
+  accumulator->last->data[accumulator->numInLast++] = arg;
+  return ProbeState::kDone;
+}
+
+/// A mock Ops parameter class to do group by.
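+/// The Ops contract exercised by GpuHashTable::updatingProbe: blockBase() and
+/// numRowsInBlock() locate this TB's slice of input, hash() and compare()
+/// identify the group, insert() may answer kRetry (lost a tag race) or
+/// kNeedSpace (row allocation failed, retried on the host), and
+/// getExclusive() / update() / writeDone() bracket the accumulator update.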
+class MockGroupByOps { + public: + int32_t __device__ blockBase(HashProbe* probe) { + return probe->numRowsPerThread * blockDim.x * blockIdx.x; + } + + int32_t __device__ numRowsInBlock(HashProbe* probe) { + return probe->numRows[blockIdx.x]; + } + + uint64_t __device__ hash(int32_t i, HashProbe* probe) { + auto key = reinterpret_cast(probe->keys)[0]; + return hashMix(1, key[i]); + } + + bool __device__ + compare(GpuHashTable* table, TestingRow* row, int32_t i, HashProbe* probe) { + return row->key == reinterpret_cast(probe->keys)[0][i]; + } + + TestingRow* __device__ + newRow(GpuHashTable* table, int32_t partition, int32_t i, HashProbe* probe) { + auto* allocator = &table->allocators[partition]; + auto row = allocator->allocateRow(); + if (row) { + row->key = reinterpret_cast(probe->keys)[0][i]; + row->flags = 0; + row->count = 0; + new (&row->concatenation) ArrayAgg64(); + } + return row; + } + + ProbeState __device__ insert( + GpuHashTable* table, + int32_t partition, + GpuBucket* bucket, + uint32_t misses, + uint32_t oldTags, + uint32_t tagWord, + int32_t i, + HashProbe* probe, + TestingRow*& row) { + if (!row) { + row = newRow(table, partition, i, probe); + if (!row) { + return ProbeState::kNeedSpace; + } + } + auto missShift = __ffs(misses) - 1; + if (!bucket->addNewTag(tagWord, oldTags, missShift)) { + return ProbeState::kRetry; + } + bucket->store(missShift / 8, row); + return ProbeState::kDone; + } + + TestingRow* __device__ getExclusive( + GpuHashTable* table, + GpuBucket* bucket, + TestingRow* row, + int32_t hitIdx, + int32_t warp) { + return row; + int32_t nanos = 1; + for (;;) { + if (atomicTryLock(&row->flags)) { + return row; + } + __nanosleep((nanos + threadIdx.x) & 31); + nanos += 3; + } + } + + void __device__ writeDone(TestingRow* row) { + // atomicUnlock(&row->flags); + } + + ProbeState __device__ update( + GpuHashTable* table, + GpuBucket* bucket, + TestingRow* row, + int32_t i, + HashProbe* probe) { + auto* keys = reinterpret_cast(probe->keys); + atomicAdd((unsigned long long*)&row->count, (unsigned long long)keys[1][i]); + return ProbeState::kDone; + int64_t arg = keys[1][i]; + int32_t part = table->partitionIdx(bucket - table->buckets); + auto* allocator = &table->allocators[part]; + auto state = arrayAgg64Append(&row->concatenation, arg, allocator); + row->flags = 0; + __threadfence(); + return state; + } +}; + +void __global__ __launch_bounds__(1024) hashTestKernel( + GpuHashTable* table, + HashProbe* probe, + BlockTestStream::HashCase mode) { + switch (mode) { + case BlockTestStream::HashCase::kGroup: { + table->updatingProbe(probe, MockGroupByOps()); + break; + } + case BlockTestStream::HashCase::kBuild: + case BlockTestStream::HashCase::kProbe: + *(long*)0 = 0; // Unimplemented. 
+  }
+}
+
+void BlockTestStream::hashTest(
+    GpuHashTableBase* table,
+    HashRun& run,
+    HashCase mode) {
+  constexpr int32_t kBlockSize = 256;
+  int32_t shared = 0;
+  if (mode == HashCase::kGroup) {
+    shared = GpuHashTable::updatingProbeSharedSize();
+  }
+  hashTestKernel<<<run.numBlocks, kBlockSize, shared, stream_->stream>>>(
+      reinterpret_cast<GpuHashTable*>(table), run.probe, mode);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+void __global__ allocatorTestKernel(
+    int32_t numAlloc,
+    int32_t numFree,
+    int32_t numStr,
+    AllocatorTestResult* allResults) {
+  auto* result = allResults + threadIdx.x + blockIdx.x * blockDim.x;
+  for (;;) {
+    int32_t maxRows = sizeof(result->rows) / sizeof(result->rows[0]);
+    int32_t maxStrings = sizeof(result->strings) / sizeof(result->strings[0]);
+    for (auto count = 0; count < numAlloc; ++count) {
+      if (result->numRows >= maxRows) {
+        return;
+      }
+      auto newRow = result->allocator->allocateRow<int64_t>();
+      if (newRow == nullptr) {
+        return;
+      }
+      if (reinterpret_cast<uint64_t>(newRow) == result->allocator->base) {
+        printf(""); // Debug hook: row at the base of the partition.
+      }
+
+      result->rows[result->numRows++] = newRow;
+    }
+    for (auto count = 0; count < numFree; ++count) {
+      if (result->numRows == 0) {
+        return;
+      }
+      auto* toFree = result->rows[--result->numRows];
+      if (reinterpret_cast<uint64_t>(toFree) == result->allocator->base) {
+        printf(""); // GPF();
+      }
+      if (!result->allocator->inRange(toFree)) {
+        GPF();
+      }
+      result->allocator->freeRow(toFree);
+    }
+    for (auto count = 0; count < numStr; ++count) {
+      if (result->numStrings >= maxStrings) {
+        return;
+      }
+      auto str = result->allocator->allocate<char>(11);
+      if (!str) {
+        return;
+      }
+      result->strings[result->numStrings++] = reinterpret_cast<int64_t*>(str);
+    }
+  }
+}
+
+void __global__ initAllocatorKernel(RowAllocator* allocator) {
+  if (threadIdx.x == 0) {
+    if (allocator->freeSet) {
+      reinterpret_cast<FreeSet<uint32_t, 1024>*>(allocator->freeSet)->clear();
+    }
+  }
+}
+
+// static
+int32_t BlockTestStream::freeSetSize() {
+  return sizeof(FreeSet<uint32_t, 1024>);
+}
+
+void BlockTestStream::initAllocator(HashPartitionAllocator* allocator) {
+  initAllocatorKernel<<<1, 1, 0, stream_->stream>>>(
+      reinterpret_cast<RowAllocator*>(allocator));
+  CUDA_CHECK(cudaGetLastError());
+}
+
+void BlockTestStream::rowAllocatorTest(
+    int32_t numBlocks,
+    int32_t numAlloc,
+    int32_t numFree,
+    int32_t numStr,
+    AllocatorTestResult* results) {
+  allocatorTestKernel<<<numBlocks, 256, 0, stream_->stream>>>(
+      numAlloc, numFree, numStr, results);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+#define UPDATE_CASE(name, func, smem)                                \
+  void __global__ name##Kernel(TestingRow* rows, HashProbe* probe) { \
+    func(rows, probe);                                               \
+  }                                                                  \
+                                                                     \
+  void BlockTestStream::name(TestingRow* rows, HashRun& run) {       \
+    name##Kernel<<<run.numBlocks, 256, smem, stream_->stream>>>(     \
+        rows, run.probe);                                            \
+    CUDA_CHECK(cudaGetLastError());                                  \
+  }

+UPDATE_CASE(updateSum1NoSync, testSumNoSync, 0);
+UPDATE_CASE(updateSum1Mtx, testSumMtx, 0);
+UPDATE_CASE(updateSum1MtxCoalesce, testSumMtxCoalesce, 0);
+UPDATE_CASE(updateSum1Atomic, testSumAtomic, 0);
+UPDATE_CASE(updateSum1AtomicCoalesce, testSumAtomicCoalesce, 0);
+UPDATE_CASE(updateSum1Exch, testSumExch, sizeof(ProbeShared));
+UPDATE_CASE(updateSum1Order, testSumOrder, 0);
+
+void __global__ __launch_bounds__(1024) update1PartitionKernel(
+    int32_t numRows,
+    int32_t numDistinct,
+    int32_t numParts,
+    int32_t blockStride,
+    HashProbe* probe,
+    int32_t* temp) {
+  auto blockStart = blockStride * blockIdx.x;
+  auto keys = reinterpret_cast<int64_t**>(probe->keys);
+  auto indices = keys[0];
+  partitionRows<256, int32_t>(
+      [&](auto i) -> int32_t { return indices[i + blockStart] % numParts; },
+      // The last thread block takes the remainder of the rows.
+      blockIdx.x == gridDim.x - 1 ?
+          numRows - blockStart : blockStride,
+      numParts,
+      temp + blockIdx.x * blockStride,
+      probe->hostRetries + blockStride * blockIdx.x,
+      probe->kernelRetries1 + blockStride * blockIdx.x);
+}
+
+void __global__ updateSum1PartKernel(
+    TestingRow* rows,
+    int32_t numParts,
+    HashProbe* probe,
+    int32_t numGroups,
+    int32_t groupStride) {
+  testSumPart(
+      rows,
+      numParts,
+      probe,
+      probe->kernelRetries1,
+      probe->hostRetries,
+      numGroups,
+      groupStride);
+}
+
+void BlockTestStream::updateSum1Part(TestingRow* rows, HashRun& run) {
+  auto numParts = std::min(run.numDistinct, 8192);
+  auto groupStride = run.numRows / 32;
+  auto numGroups = run.numRows / groupStride;
+  auto partSmem = partitionRowsSharedSize<256>(numParts);
+  // We use probe->kernelRetries1 as the partitioned row numbers array and
+  // probe->hostRetries as the array of partition starts. So, with 10
+  // partitions, kernelRetries1[x..y-1] are the input rows of partition 1,
+  // where x is partition start 0 and y is partition start 1.
+  update1PartitionKernel<<<numGroups, 256, partSmem, stream_->stream>>>(
+      run.numRows,
+      run.numDistinct,
+      numParts,
+      groupStride,
+      run.probe,
+      run.partitionTemp);
+  CUDA_CHECK(cudaGetLastError());
+
+  int32_t blockSize = roundUp(std::min(256, numParts), 32);
+  int32_t numBlocks = numParts / blockSize;
+  // There will be one lane per partition. The last blocks may have empty
+  // lanes.
+  if (numBlocks * blockSize < numParts) {
+    ++numBlocks;
+  }
+  updateSum1PartKernel<<<numBlocks, blockSize, 0, stream_->stream>>>(
+      rows, numParts, run.probe, numGroups, groupStride);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+REGISTER_KERNEL("testSort", testSort);
+REGISTER_KERNEL("boolToIndices", boolToIndicesKernel);
+REGISTER_KERNEL("bool256ToIndices", bool256ToIndicesKernel);
+REGISTER_KERNEL("sum64", sum64);
+REGISTER_KERNEL("partitionShorts", partitionShortsKernel);
+REGISTER_KERNEL("hashTest", hashTestKernel);
+REGISTER_KERNEL("allocatorTest", allocatorTestKernel);
+REGISTER_KERNEL("sum1atm", updateSum1AtomicKernel);
+REGISTER_KERNEL("sum1atmCoa", updateSum1AtomicCoalesceKernel);
+REGISTER_KERNEL("sum1Exch", updateSum1ExchKernel);
+REGISTER_KERNEL("sum1Part", updateSum1PartKernel);
+REGISTER_KERNEL("partSum", update1PartitionKernel);
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/tests/BlockTest.h b/velox/experimental/wave/common/tests/BlockTest.h
index bed782ff14ade..63d16b9f66a07 100644
--- a/velox/experimental/wave/common/tests/BlockTest.h
+++ b/velox/experimental/wave/common/tests/BlockTest.h
@@ -17,31 +17,142 @@
 #pragma once

 #include "velox/experimental/wave/common/Cuda.h"
+#include "velox/experimental/wave/common/HashTable.h"
+#include "velox/experimental/wave/common/tests/HashTestUtil.h"

-/// Sample header for testing Block.cuh
+/// Sample header for testing Wave utilities.
 namespace facebook::velox::wave {

+constexpr uint32_t kPrime32 = 1815531889;
+
+/// A mock aggregate that concatenates numbers, like array_agg of bigint.
+struct ArrayAgg64 {
+  struct Run {
+    Run* next;
+    int64_t data[16];
+  };
+
+  Run* first{nullptr};
+  Run* last{nullptr};
+  // Fill of 'last->data', all other runs are full.
+  int8_t numInLast{0};
+};
+
+/// A mock hash table content row to test HashTable.
+struct TestingRow {
+  // Single key part.
+  int64_t key;
+
+  // Count of updates. Sample aggregate.
+  int64_t count{0};
+
+  // A mock concatenating aggregate. Used for testing control flow when
+  // running out of space while updating a group.
+  ArrayAgg64 concatenation;
+
+  // Next pointer in the case simulating a non-unique join table.
+  TestingRow* next{nullptr};
+
+  // Flags for updating the row, e.g. probed flag, marker for exclusive write.
+  int32_t flags{0};
+};
+
+/// Result of allocator test kernel.
+struct AllocatorTestResult {
+  RowAllocator* allocator;
+  int32_t numRows;
+  int32_t numStrings;
+  int64_t* rows[200000];
+  int64_t* strings[200000];
+};
+
 class BlockTestStream : public Stream {
  public:
   /// In each block of 256 bools in bools[i], counts the number of
   /// true and writes the indices of true lanes into the corresponding
   /// indices[i]. Stores the number of true values in sizes[i].
   void testBoolToIndices(
+      int32_t numBlocks,
+      uint8_t** flags,
+      int32_t** indices,
+      int32_t* sizes);
+
+  void testBoolToIndicesNoShared(
       int32_t numBlocks,
       uint8_t** flags,
       int32_t** indices,
       int32_t* sizes,
-      int64_t* times);
+      void*);
+
+  // Returns the smem size for block size 256 of boolToIndices().
+  static int32_t boolToIndicesSize();
+
+  void testBool256ToIndices(
+      int32_t numBlocks,
+      uint8_t** flags,
+      int32_t** indices,
+      int32_t* sizes);
+
+  void testBool256ToIndicesNoShared(
+      int32_t numBlocks,
+      uint8_t** flags,
+      int32_t** indices,
+      int32_t* sizes,
+      void*);
+
+  // Returns the smem size for bool256ToIndices().
+  static int32_t bool256ToIndicesSize();

   // Calculates the sum over blocks of 256 int64s and returns the result for
   // numbers[i * 256] ... numbers[(i + 1) * 256 - 1] inclusive in results[i].
   void testSum64(int32_t numBlocks, int64_t* numbers, int64_t* results);

-  /// Sorts 'rows'[i] using ids[i] as keys and stores the sorted order in
-  /// 'result[i]'.
-  // void dedup(int32_t numBlocks, uint16_t** ids, uint16_t** rows, uint16_t**
-  // resultRows);
+  static int32_t sort16SharedSize();
+
+  void testSort16(int32_t numBlocks, uint16_t** keys, uint16_t** values);
+  void testSort16NoShared(
+      int32_t numBlocks,
+      uint16_t** keys,
+      uint16_t** values,
+      char* temp);
+
+  void partitionShorts(
+      int32_t numBlocks,
+      uint16_t** keys,
+      int32_t* numKeys,
+      int32_t numPartitions,
+      int32_t** ranks,
+      int32_t** partitionStarts,
+      int32_t** partitionedRows);
+
+  // Operation for hash table tests.
+  enum class HashCase { kGroup, kBuild, kProbe };
+
+  /// Does probe/groupby/build on 'table'. 'probe' contains the parameters and
+  /// temp storage. 'table' and 'probe' are expected to be resident on device.
+  /// The number of TBs to run and the rows per TB are given in 'probe'.
+  void hashTest(GpuHashTableBase* table, HashRun& probe, HashCase mode);
+
+  static int32_t freeSetSize();
+
+  void initAllocator(HashPartitionAllocator* allocator);
+
+  /// Tests RowAllocator.
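+  /// Each thread loops: allocate 'numAlloc' rows, free 'numFree' of them,
+  /// then allocate 'numStr' short strings, until an allocation fails or a
+  /// per-thread result array fills up. Outcomes are recorded in 'results',
+  /// one entry per thread.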
+ void rowAllocatorTest( + int32_t numBlocks, + int32_t numAlloc, + int32_t numFree, + int32_t numStr, + AllocatorTestResult* results); + + void updateSum1Atomic(TestingRow* rows, HashRun& run); + void updateSum1Exch(TestingRow* rows, HashRun& run); + void updateSum1NoSync(TestingRow* rows, HashRun& run); + void updateSum1AtomicCoalesce(TestingRow* rows, HashRun& run); + void updateSum1Part(TestingRow* rows, HashRun& run); + void updateSum1Mtx(TestingRow* rows, HashRun& run); + void updateSum1MtxCoalesce(TestingRow* rows, HashRun& run); + void updateSum1Order(TestingRow* rows, HashRun& run); }; } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/CMakeLists.txt b/velox/experimental/wave/common/tests/CMakeLists.txt index f9d2a3305eec9..8914f6a5357cd 100644 --- a/velox/experimental/wave/common/tests/CMakeLists.txt +++ b/velox/experimental/wave/common/tests/CMakeLists.txt @@ -12,8 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu - BlockTest.cpp BlockTest.cu) +add_executable( + velox_wave_common_test + GpuArenaTest.cpp + CudaTest.cpp + CudaTest.cu + BlockTest.cpp + BlockTest.cu + HashTableTest.cpp + HashTestUtil.cpp) add_test(velox_wave_common_test velox_wave_common_test) diff --git a/velox/experimental/wave/common/tests/CpuTable.h b/velox/experimental/wave/common/tests/CpuTable.h new file mode 100644 index 0000000000000..d960e11b61bdb --- /dev/null +++ b/velox/experimental/wave/common/tests/CpuTable.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include <cassert>
+#include <string>
+
+#include "velox/common/base/SimdUtil.h"
+
+namespace facebook::velox::wave {
+
+class CpuBucket {
+ public:
+#if XSIMD_WITH_SSE2
+  using TagVector = xsimd::batch<uint8_t>;
+#elif XSIMD_WITH_NEON
+  using TagVector = xsimd::batch<uint8_t>;
+#endif
+
+  auto loadTags() {
+#if XSIMD_WITH_SSE2
+    return TagVector(_mm_loadu_si128(reinterpret_cast<__m128i const*>(tags_)));
+#elif XSIMD_WITH_NEON
+    return TagVector(vld1q_u8(tags_));
+#endif
+  }
+
+  void setTag(int32_t idx, uint8_t tag) {
+    tags_[idx] = tag;
+  }
+
+  static inline uint16_t matchTags(TagVector tags, uint8_t tag) {
+    auto flags = TagVector::broadcast(tag) == tags;
+    return simd::toBitMask(flags);
+  }
+
+  template <typename T>
+  T* load(int32_t idx) {
+    uint64_t data = *reinterpret_cast<uint64_t*>(&data_[idx * 6]);
+    return reinterpret_cast<T*>(data & 0xffffffffffff);
+  }
+
+  void store(uint32_t idx, void* row) {
+    auto uptr = reinterpret_cast<uint64_t>(row);
+    uint64_t data = *reinterpret_cast<uint64_t*>(&data_[idx * 6]);
+    *reinterpret_cast<uint64_t*>(&data_[idx * 6]) =
+        (data & 0xffff000000000000) | uptr;
+  }
+
+ private:
+  uint8_t tags_[16];
+  uint8_t data_[128 - 16];
+};
+
+struct CpuHashTable {
+  CpuHashTable() = default;
+
+  CpuHashTable(int32_t numSlots, int32_t rowBytes) {
+    auto numBuckets = bits::nextPowerOfTwo(numSlots) / 16;
+    assert(numBuckets > 0);
+    sizeMask = numBuckets - 1;
+    bucketSpace.resize(numBuckets * sizeof(CpuBucket) + 64);
+    buckets = reinterpret_cast<CpuBucket*>(
+        bits::roundUp(reinterpret_cast<uintptr_t>(bucketSpace.data()), 64));
+    rows.resize(rowBytes);
+  }
+
+  std::string bucketSpace;
+
+  CpuBucket* buckets;
+
+  int32_t sizeMask;
+
+  // Preallocated space for rows. Do not resize.
+  std::string rows;
+
+  // Number of used bytes in 'rows'.
+  int32_t spaceUsed{0};
+
+  // Number of entries.
+  int32_t size{0};
+
+  template <typename T>
+  T* newRow() {
+    auto size = sizeof(T);
+    if (spaceUsed + size > rows.size()) {
+      return nullptr;
+    }
+    auto row = reinterpret_cast<T*>(rows.data() + spaceUsed);
+    spaceUsed += size;
+    return row;
+  }
+
+  template <typename RowType, typename Ops>
+  RowType* find(int64_t key, uint64_t h, Ops ops) const {
+    uint8_t tag = 0x80 | (h >> 32);
+    int32_t bucketIdx = h & sizeMask;
+    for (;;) {
+      auto tags = buckets[bucketIdx].loadTags();
+      auto hits = CpuBucket::matchTags(tags, tag);
+      while (hits) {
+        auto idx = bits::getAndClearLastSetBit(hits);
+        auto row = buckets[bucketIdx].load<RowType>(idx);
+        if (ops.compare1(this, row, key)) {
+          return row;
+        }
+      }
+      auto misses = CpuBucket::matchTags(tags, 0);
+      if (misses) {
+        return nullptr;
+      }
+      bucketIdx = (1 + bucketIdx) & sizeMask;
+    }
+  }
+
+  template <typename RowType, typename Ops>
+  void updatingProbe(int32_t numRows, HashProbe* probe, Ops ops) {
+    for (auto i = 0; i < numRows; ++i) {
+      auto h = probe->hashes[i];
+      uint8_t tag = 0x80 | (h >> 32);
+      auto bucketIdx = h & sizeMask;
+      for (;;) {
+        auto tags = buckets[bucketIdx].loadTags();
+        auto hits = CpuBucket::matchTags(tags, tag);
+        while (hits) {
+          auto idx = bits::getAndClearLastSetBit(hits);
+          auto row = buckets[bucketIdx].load<RowType>(idx);
+          if (ops.compare(this, row, i, probe)) {
+            ops.update(this, row, i, probe);
+            goto done;
+          }
+        }
+        auto misses = CpuBucket::matchTags(tags, 0);
+        if (misses) {
+          int32_t idx = bits::getAndClearLastSetBit(misses);
+          buckets[bucketIdx].setTag(idx, tag);
+          auto* newRow = ops.newRow(this, i, probe);
+          buckets[bucketIdx].store(idx, newRow);
+          ++size;
+          ops.update(this, newRow, i, probe);
+          break;
+        }
+        bucketIdx = (bucketIdx + 1) & sizeMask;
+      }
+    done:;
+    }
+  }
+
+  void check() {
+    for (auto i = 0; i <= sizeMask; ++i) {
+      for (auto j = 0; j < 16; j++) {
+        auto row = buckets[i].load<char>(j);
+        if (!row ||
+            (row >= rows.data() && row < rows.data() + rows.size())) {
+          continue;
+        }
+        VELOX_FAIL();
+      }
+    }
+  }
+};
+
+} // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/tests/CudaTest.cpp b/velox/experimental/wave/common/tests/CudaTest.cpp
index c40351d287c04..167b83c9534af 100644
--- a/velox/experimental/wave/common/tests/CudaTest.cpp
+++ b/velox/experimental/wave/common/tests/CudaTest.cpp
@@ -32,9 +32,14 @@
 #include "velox/common/time/Timer.h"
 #include "velox/experimental/wave/common/GpuArena.h"
 #include "velox/experimental/wave/common/tests/BlockTest.h"
+#include "velox/experimental/wave/common/tests/CpuTable.h"
+#include "velox/experimental/wave/common/tests/HashTestUtil.h"
+#include "velox/experimental/wave/common/tests/Util.h"
 #include <iostream>
 
+DEFINE_bool(list_kernels, false, "Lists kernel occupancy and registers");
+
 DEFINE_int32(num_streams, 0, "Number of parallel streams");
 DEFINE_int32(op_size, 0, "Size of invoke kernel (ints read and written)");
 DEFINE_int32(
@@ -480,19 +485,30 @@ struct RoundtripStats {
   std::string toString() const {
     return fmt::format(
-        "{}: rps={} gips={} mode={} threads={} micros={} avgus={} toDev={} GB/s toHost={} GB/s",
+        "{}: rps={:.2f} gips={:.4f} mode={} threads={} micros={} avgus={:.2f} toDev={:.2f} GB/s toHost={:.2f} GB/s",
         id,
-        (numThreads * numOps) / (micros / 1000000),
-        numAdds / (micros * 1000),
+        (numThreads * numOps) / (micros / 1000000.0),
+        numAdds / (micros * 1000.0),
         mode,
         numThreads,
         micros,
         micros / numOps,
-        toDeviceBytes / (micros * 1000),
-        toHostBytes / (micros * 1000));
+        toDeviceBytes / (micros * 1000.0),
+        toHostBytes / (micros * 1000.0));
   }
 };
 
+// Checks a number for primality. Returns 0 for a prime and a proper factor
+// otherwise.
+int64_t factor(int64_t n) {
+  if (n > 2 && n % 2 == 0) {
+    return 2;
+  }
+  int64_t end = sqrt(n);
+  for (int64_t f = 3; f <= end; f += 2) {
+    if (n % f == 0) {
+      return f;
+    }
+  }
+  return 0;
+}
+
 /// Describes one thread of execution in round trip measurement. Each thread
 /// does a sequence of data transfers, kernel calls and synchronizations. The
 /// operations are described in a string of the form:
@@ -510,12 +526,13 @@ struct RoundtripStats {
 /// stream with record event + wait event.
 class RoundtripThread {
  public:
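The random-access operations below index their lookup table with scale32, the multiply-and-shift range mapping added in Util.h later in this patch: the 64-bit product n * scale lies in [0, 2^32 * scale), so its high 32 bits fall in [0, scale) without a division. A small self-checking sketch of that identity (the main() harness here is illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Maps 'n' into [0, scale); matches the scale32 definition in this patch.
inline uint32_t scale32(uint32_t n, uint32_t scale) {
  return (static_cast<uint64_t>(n) * scale) >> 32;
}

int main() {
  assert(scale32(0, 1000) == 0);
  assert(scale32(0xffffffffu, 1000) == 999); // top of range maps to scale - 1
  return 0;
}
```

-  // Up to 32 MB of ints.
+  // Up to 64 MB of ints.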
+  static constexpr int32_t kNumKB = 64 << 10;
   static constexpr int32_t kNumInts = kNumKB * 256;
 
   RoundtripThread(int32_t device, ArenaSet* arenas) : arenas_(arenas) {
-    setDevice(getDevice(device));
+    device_ = getDevice(device);
+    setDevice(device_);
     hostBuffer_ = arenas_->host->allocate<int32_t>(kNumInts);
     deviceBuffer_ = arenas_->device->allocate<int32_t>(kNumInts);
     lookupBuffer_ = arenas_->device->allocate<int32_t>(kNumInts);
@@ -536,13 +553,26 @@
         hostLookup_.get(),
         hostBuffer_->as<int32_t>(),
         kNumInts * sizeof(int32_t));
+    serial_ = ++serialCounter_;
+  }
+
+  ~RoundtripThread() {
+    try {
+      stream_->wait();
+    } catch (const std::exception& e) {
+      LOG(ERROR) << "Error in sync on ~RoundtripThread(): " << e.what();
+    }
   }
 
   enum class OpCode {
     kToDevice,
     kToHost,
     kAdd,
+    kAddShared,
+    kAddReg,
     kAddRandom,
+    kAddRandomEmptyWarps,
+    kAddRandomEmptyThreads,
     kWideAdd,
     kEnd,
     kSync,
@@ -553,6 +583,9 @@
     OpCode opCode;
     int32_t param1{1};
     int32_t param2{0};
+    int32_t param3{0};
+    int32_t param4{0};
+    int32_t param5{0};
   };
 
   void run(RoundtripStats& stats) {
@@ -606,6 +639,27 @@
         }
         stats.numAdds += op.param1 * op.param2 * 256;
         break;
+      case OpCode::kAddShared:
+        VELOX_CHECK_LE(op.param1, kNumKB);
+        if (stats.isCpu) {
+          addOneCpu(op.param1 * 256, op.param2);
+        } else {
+          stream_->addOneShared(
+              deviceBuffer_->as<int32_t>(), op.param1 * 256, op.param2);
+        }
+        stats.numAdds += op.param1 * op.param2 * 256;
+        break;
+      case OpCode::kAddReg:
+        VELOX_CHECK_LE(op.param1, kNumKB);
+        if (stats.isCpu) {
+          addOneCpu(op.param1 * 256, op.param2);
+        } else {
+          stream_->addOneReg(
+              deviceBuffer_->as<int32_t>(), op.param1 * 256, op.param2);
+        }
+        stats.numAdds += op.param1 * op.param2 * 256;
+        break;
+
       case OpCode::kWideAdd:
         VELOX_CHECK_LE(op.param1, kNumKB);
         if (stats.isCpu) {
@@ -618,15 +672,22 @@
         break;
 
       case OpCode::kAddRandom:
+      case OpCode::kAddRandomEmptyWarps:
+      case OpCode::kAddRandomEmptyThreads:
         VELOX_CHECK_LE(op.param1, kNumKB);
         if (stats.isCpu) {
-          addOneRandomCpu(op.param1 * 256, op.param2);
+          addOneRandomCpu(op.param1 * 256, op.param2, op.param4, op.param5);
         } else {
           stream_->addOneRandom(
               deviceBuffer_->as<int32_t>(),
               lookupBuffer_->as<int32_t>(),
               op.param1 * 256,
-              op.param2);
+              op.param2,
+              op.param3,
+              op.param4,
+              op.param5,
+              op.opCode == OpCode::kAddRandomEmptyWarps,
+              op.opCode == OpCode::kAddRandomEmptyThreads);
         }
         stats.numAdds += op.param1 * op.param2 * 256;
         break;
@@ -653,7 +714,7 @@
     stats.endMicros = getCurrentTimeMicro();
   }
 
-  void addOneCpu(int32_t size, int32_t repeat) {
+  FOLLY_NOINLINE void addOneCpu(int32_t size, int32_t repeat) {
     int32_t* ints = hostInts_.get();
     for (auto counter = 0; counter < repeat; ++counter) {
       for (auto i = 0; i < size; ++i) {
@@ -661,16 +722,23 @@
       }
     }
  }
-  void addOneRandomCpu(uint32_t size, int32_t repeat) {
+  FOLLY_NOINLINE void addOneRandomCpu(
+      uint32_t size,
+      int32_t repeat,
+      int32_t numLocal,
+      int32_t localStride) {
     int32_t* ints = hostInts_.get();
     int32_t* lookup = hostLookup_.get();
     for (uint32_t counter = 0; counter < repeat; ++counter) {
       for (auto i = 0; i < size; ++i) {
-        auto rnd = (static_cast<uint64_t>(
-                        static_cast<uint32_t>(i * (counter + 1) * 1367836089)) *
-                    size) >>
-            32;
-        ints[i] += lookup[rnd];
+        auto rnd = scale32(i * (counter + 1) * kPrime32, size);
+        auto sum = lookup[rnd];
+        auto limit =
+            std::min<int64_t>(rnd + localStride * (1 + numLocal), size);
+        for (auto j = rnd + localStride; j < limit; j += localStride) {
+          sum += lookup[j];
+        }
+        ints[i] += sum;
       }
     }
   }
@@ -699,6 +767,13 @@ class
RoundtripThread { case 'a': op.opCode = OpCode::kAdd; ++position; + if (str[position] == 's') { + op.opCode = OpCode::kAddShared; + ++position; + } else if (str[position] == 'r') { + op.opCode = OpCode::kAddReg; + ++position; + } op.param1 = parseInt(str, position, 1); op.param2 = parseInt(str, position, 1); return op; @@ -710,10 +785,26 @@ class RoundtripThread { return op; case 'r': - op.opCode = OpCode::kAddRandom; ++position; + if (str[position] == 'w') { + op.opCode = OpCode::kAddRandomEmptyWarps; + ++position; + } else if (str[position] == 't') { + op.opCode = OpCode::kAddRandomEmptyThreads; + ++position; + } else { + op.opCode = OpCode::kAddRandom; + } + // Size of data to update and lookup array (KB). op.param1 = parseInt(str, position, 1); + // Number of repeats. op.param2 = parseInt(str, position, 1); + // target number of threads in kernel. + op.param3 = parseInt(str, position, 10240); + // Number of nearby memory accesses + op.param4 = parseInt(str, position, 0); + // Stride of nearby memory accesses + op.param5 = parseInt(str, position, 0); return op; case 's': @@ -750,6 +841,7 @@ class RoundtripThread { } ArenaSet* const arenas_; + Device* device_{nullptr}; WaveBufferPtr deviceBuffer_; WaveBufferPtr hostBuffer_; WaveBufferPtr lookupBuffer_; @@ -757,6 +849,8 @@ class RoundtripThread { std::unique_ptr hostInts_; std::unique_ptr stream_; std::unique_ptr event_; + int32_t serial_{0}; + static inline std::atomic serialCounter_{0}; }; class CudaTest : public testing::Test { @@ -849,7 +943,7 @@ class CudaTest : public testing::Test { waitEach(streams, events); } for (auto i = 0; i < numStreams; ++i) { - streams[i]->addOne(ints[i], opSize); + streams[i]->incOne(ints[i], opSize); if (counter == 0 || counter >= firstNotify) { streams[i]->addCallback([&]() { auto d = getCurrentTimeMicro() - start; @@ -1072,7 +1166,7 @@ class CudaTest : public testing::Test { int numOps = 10000) { auto arenas = getArenas(); std::vector allStats; - std::vector numThreadsValues = {2, 4, 8, 16, 32}; + std::vector numThreadsValues = {1, 2, 4, 8, 16, 32}; int32_t ordinal = 0; for (auto numThreads : numThreadsValues) { std::vector runStats; @@ -1169,7 +1263,7 @@ TEST_F(CudaTest, stream) { stream.prefetch(nullptr, ints, opSize * sizeof(int32_t)); stream.wait(); for (auto i = 0; i < opSize; ++i) { - ASSERT_EQ(ints[i], i + 1); + ASSERT_EQ(ints[i], i + (i & 31)); } allocator_->free(ints, sizeof(int32_t) * opSize); } @@ -1284,9 +1378,8 @@ TEST_F(CudaTest, roundtripMatrix) { if (!FLAGS_roundtrip_ops.empty()) { std::vector modes = {FLAGS_roundtrip_ops}; roundtripTest( - fmt::format("{} GPU, 1000 repeats", modes[0]), modes, false, 1000); - roundtripTest( - fmt::format("{} CPU, 100 repeats", modes[0]), modes, true, 100); + fmt::format("{} GPU, 64 repeats", modes[0]), modes, false, 64); + roundtripTest(fmt::format("{} CPU, 32 repeats", modes[0]), modes, true, 32); return; } if (!FLAGS_enable_bm) { @@ -1313,8 +1406,8 @@ TEST_F(CudaTest, roundtripMatrix) { "d1000a1000,30h1sd1a1000,30h1s", "d1000a1000,150h1sd1a1000,150h1s", }; - roundtripTest("Seq GPU", seqModeValues, false, 1024); - roundtripTest("Seq CPU", seqModeValues, true, 64); + roundtripTest("Seq GPU", seqModeValues, false, 32); + roundtripTest("Seq CPU", seqModeValues, true, 16); std::vector randomModeValues = { "d100r100,10h1s", @@ -1325,8 +1418,85 @@ TEST_F(CudaTest, roundtripMatrix) { "d1000r1000,100h1s", "d10000r10000,10h1s", "d30000r30000,50h1s"}; - roundtripTest("Random GPU", randomModeValues, false, 512); - roundtripTest("Random CPU", randomModeValues, 
true, 16); + roundtripTest("Random GPU", randomModeValues, false, 16); + roundtripTest("Random CPU", randomModeValues, true, 8); + + std::vector widthModeValues = { + "d100r100,10,256h1s", + "d100r100,10,1024", + "d100r100,10,8192", + "d30000r30000,5,256h1s", + "d30000r30000,5,256h1s", + "d30000r30000,5,512h1s", + "d30000r30000,5,2048h1s", + "d30000r30000,5,10240h1s", + "d30000rw30000,5,10240h1s", + "d30000rt30000,5,10240h1s"}; + roundtripTest("Random GPU, width and conditional", widthModeValues, false, 8); +} + +TEST_F(CudaTest, addRandom) { + constexpr int32_t kNumInts = 16 << 20; + auto arenas = getArenas(); + auto stream = std::make_unique(); + auto indices = arenas->unified->allocate(kNumInts); + auto sourceBuffer = arenas->unified->allocate(kNumInts); + auto rawIndices = indices->as(); + for (auto i = 0; i < kNumInts; ++i) { + rawIndices[i] = i + 1; + } + stream->prefetch(getDevice(), rawIndices, indices->capacity()); + auto ints1 = arenas->unified->allocate(kNumInts); + auto rawInts1 = ints1->as(); + auto ints2 = arenas->unified->allocate(kNumInts); + auto rawInts2 = ints2->as(); + auto ints3 = arenas->unified->allocate(kNumInts); + auto rawInts3 = ints3->as(); + memset(rawInts1, 0, kNumInts * sizeof(int32_t)); + memset(rawInts2, 0, kNumInts * sizeof(int32_t)); + memset(rawInts3, 0, kNumInts * sizeof(int32_t)); + stream->prefetch(getDevice(), rawInts1, ints1->capacity()); + stream->prefetch(getDevice(), rawInts2, ints2->capacity()); + stream->prefetch(getDevice(), rawInts3, ints3->capacity()); + // Let prefetch finish. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // warm up. + stream->addOneRandom(rawInts1, rawIndices, kNumInts, 20, 10240); + stream->addOneRandom(rawInts2, rawIndices, kNumInts, 20, 10240, true); + stream->addOneRandom(rawInts3, rawIndices, kNumInts, 20, 10240, false, true); + stream->wait(); + + uint64_t time1 = 0; + uint64_t time2 = 0; + uint64_t time3 = 0; + for (auto count = 0; count < 20; ++count) { + { + MicrosecondTimer t(&time1); + stream->addOneRandom(rawInts1, rawIndices, kNumInts, 20, 10240); + stream->wait(); + } + { + MicrosecondTimer t(&time2); + stream->addOneRandom(rawInts2, rawIndices, kNumInts, 20, 10240, true); + stream->wait(); + } + { + MicrosecondTimer t(&time3); + stream->addOneRandom( + rawInts3, rawIndices, kNumInts, 20, 10240, false, true); + stream->wait(); + } + } + std::cout << fmt::format( + "All {}, half warps {} half threads {}", time1, time2, time3) + << std::endl; + + stream->prefetch(nullptr, rawInts1, ints1->capacity()); + stream->prefetch(nullptr, rawInts2, ints2->capacity()); + stream->prefetch(nullptr, rawInts3, ints3->capacity()); + + EXPECT_EQ(0, memcmp(rawInts1, rawInts2, kNumInts * sizeof(int32_t))); + EXPECT_EQ(0, memcmp(rawInts1, rawInts3, kNumInts * sizeof(int32_t))); } int main(int argc, char** argv) { @@ -1336,5 +1506,8 @@ int main(int argc, char** argv) { LOG(WARNING) << "No CUDA detected, skipping all tests"; return 0; } + if (FLAGS_list_kernels) { + printKernels(); + } return RUN_ALL_TESTS(); } diff --git a/velox/experimental/wave/common/tests/CudaTest.cu b/velox/experimental/wave/common/tests/CudaTest.cu index da529db26144a..14f97f577a3c3 100644 --- a/velox/experimental/wave/common/tests/CudaTest.cu +++ b/velox/experimental/wave/common/tests/CudaTest.cu @@ -14,57 +14,164 @@ * limitations under the License. 
 */
 
+#include "velox/experimental/wave/common/Block.cuh"
 #include "velox/experimental/wave/common/CudaUtil.cuh"
 #include "velox/experimental/wave/common/tests/CudaTest.h"
 
 namespace facebook::velox::wave {
+constexpr uint32_t kPrime32 = 1815531889;
 
 __global__ void
-addOneKernel(int32_t* numbers, int32_t size, int32_t stride, int32_t repeats) {
-  auto index = blockDim.x * blockIdx.x + threadIdx.x;
+incOneKernel(int32_t* numbers, int32_t size, int32_t stride, int32_t repeats) {
   for (auto counter = 0; counter < repeats; ++counter) {
-    for (; index < size; index += stride) {
+    for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size;
+         index += stride) {
       ++numbers[index];
     }
     __syncthreads();
   }
 }
 
-void TestStream::addOne(int32_t* numbers, int32_t size, int32_t repeats) {
-  constexpr int32_t kWidth = 10240;
+__global__ void
+addOneKernel(int32_t* numbers, int32_t size, int32_t stride, int32_t repeats) {
+  for (auto counter = 0; counter < repeats; ++counter) {
+    for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size;
+         index += stride) {
+      numbers[index] += index & 31;
+    }
+    __syncthreads();
+  }
+}
+
+__global__ void addOneSharedKernel(
+    int32_t* numbers,
+    int32_t size,
+    int32_t stride,
+    int32_t repeats) {
+  extern __shared__ __align__(16) char smem[];
+  int32_t* temp = reinterpret_cast<int32_t*>(smem);
+  for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size;
+       index += stride) {
+    temp[threadIdx.x] = numbers[index];
+    for (auto counter = 0; counter < repeats; ++counter) {
+      temp[threadIdx.x] += (index + counter) & 31;
+    }
+    __syncthreads();
+    numbers[index] = temp[threadIdx.x];
+  }
+}
+
+__global__ void addOneRegKernel(
+    int32_t* numbers,
+    int32_t size,
+    int32_t stride,
+    int32_t repeats) {
+  for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size;
+       index += stride) {
+    auto temp = numbers[index];
+    for (auto counter = 0; counter < repeats; ++counter) {
+      temp += (index + counter) & 31;
+    }
+    __syncthreads();
+    numbers[index] = temp;
+  }
+}
+
+void TestStream::incOne(
+    int32_t* numbers,
+    int32_t size,
+    int32_t repeats,
+    int32_t width) {
   constexpr int32_t kBlockSize = 256;
   auto numBlocks = roundUp(size, kBlockSize) / kBlockSize;
   int32_t stride = size;
-  if (numBlocks > kWidth / kBlockSize) {
-    stride = kWidth;
-    numBlocks = kWidth / kBlockSize;
+  if (numBlocks > width / kBlockSize) {
+    stride = width;
+    numBlocks = width / kBlockSize;
+  }
+  incOneKernel<<<numBlocks, kBlockSize, 0, stream_->stream>>>(
+      numbers, size, stride, repeats);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+void TestStream::addOne(
+    int32_t* numbers,
+    int32_t size,
+    int32_t repeats,
+    int32_t width) {
+  constexpr int32_t kBlockSize = 256;
+  auto numBlocks = roundUp(size, kBlockSize) / kBlockSize;
+  int32_t stride = size;
+  if (numBlocks > width / kBlockSize) {
+    stride = width;
+    numBlocks = width / kBlockSize;
   }
   addOneKernel<<<numBlocks, kBlockSize, 0, stream_->stream>>>(
       numbers, size, stride, repeats);
   CUDA_CHECK(cudaGetLastError());
 }
 
+void TestStream::addOneShared(
+    int32_t* numbers,
+    int32_t size,
+    int32_t repeats,
+    int32_t width) {
+  constexpr int32_t kBlockSize = 256;
+  auto numBlocks = roundUp(size, kBlockSize) / kBlockSize;
+  int32_t stride = size;
+  if (numBlocks > width / kBlockSize) {
+    stride = width;
+    numBlocks = width / kBlockSize;
+  }
+  addOneSharedKernel<<<
+      numBlocks,
+      kBlockSize,
+      kBlockSize * sizeof(int32_t),
+      stream_->stream>>>(numbers, size, stride, repeats);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+void TestStream::addOneReg(
+    int32_t* numbers,
+    int32_t size,
+    int32_t repeats,
+    int32_t width) {
+  constexpr
int32_t kBlockSize = 256; + auto numBlocks = roundUp(size, kBlockSize) / kBlockSize; + int32_t stride = size; + if (numBlocks > width / kBlockSize) { + stride = width; + numBlocks = width / kBlockSize; + } + addOneRegKernel<<stream>>>( + numbers, size, stride, repeats); + CUDA_CHECK(cudaGetLastError()); +} + __global__ void addOneWideKernel(WideParams params) { - auto index = blockDim.x * blockIdx.x + threadIdx.x; auto numbers = params.numbers; auto size = params.size; auto repeat = params.repeat; auto stride = params.stride; for (auto counter = 0; counter < repeat; ++counter) { - for (; index < size; index += stride) { + for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size; + index += stride) { ++numbers[index]; } } } -void TestStream::addOneWide(int32_t* numbers, int32_t size, int32_t repeat) { - constexpr int32_t kWidth = 10240; +void TestStream::addOneWide( + int32_t* numbers, + int32_t size, + int32_t repeat, + int32_t width) { constexpr int32_t kBlockSize = 256; auto numBlocks = roundUp(size, kBlockSize) / kBlockSize; int32_t stride = size; - if (numBlocks > kWidth / kBlockSize) { - stride = kWidth; - numBlocks = kWidth / kBlockSize; + if (numBlocks > width / kBlockSize) { + stride = width; + numBlocks = width / kBlockSize; } WideParams params; params.numbers = numbers; @@ -75,41 +182,108 @@ void TestStream::addOneWide(int32_t* numbers, int32_t size, int32_t repeat) { CUDA_CHECK(cudaGetLastError()); } -__global__ void addOneRandomKernel( +__global__ void __launch_bounds__(1024) addOneRandomKernel( int32_t* numbers, const int32_t* lookup, uint32_t size, int32_t stride, - int32_t repeats) { - auto index = blockDim.x * blockIdx.x + threadIdx.x; + int32_t repeats, + int32_t numLocal, + int32_t localStride, + bool emptyWarps, + bool emptyThreads) { for (uint32_t counter = 0; counter < repeats; ++counter) { - for (; index < size; index += stride) { - auto rnd = (static_cast(static_cast( - index * (counter + 1) * 1367836089)) * - size) >> - 32; - numbers[index] += lookup[rnd]; + if (emptyWarps) { + if (((threadIdx.x / 32) & 1) == 0) { + for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size; + index += stride) { + auto rnd = deviceScale32(index * (counter + 1) * kPrime32, size); + auto sum = lookup[rnd]; + auto limit = min(rnd + localStride * (1 + numLocal), size); + for (auto j = rnd + localStride; j < limit; j += localStride) { + sum += lookup[j]; + } + numbers[index] += sum; + + rnd = deviceScale32((index + 32) * (counter + 1) * kPrime32, size); + sum = lookup[rnd]; + limit = min(rnd + localStride * (1 + numLocal), size); + for (auto j = rnd + localStride; j < limit; j += localStride) { + sum += lookup[j]; + } + numbers[index + 32] += sum; + } + } + } else if (emptyThreads) { + if ((threadIdx.x & 1) == 0) { + for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size; + index += stride) { + auto rnd = deviceScale32(index * (counter + 1) * kPrime32, size); + auto sum = lookup[rnd]; + auto limit = min(rnd + localStride * (1 + numLocal), size); + for (auto j = rnd + localStride; j < limit; j += localStride) { + sum += lookup[j]; + } + numbers[index] += sum; + + rnd = deviceScale32((index + 1) * (counter + 1) * kPrime32, size); + sum = lookup[rnd]; + limit = min(rnd + localStride * (1 + numLocal), size); + for (auto j = rnd + localStride; j < limit; j += localStride) { + sum += lookup[j]; + } + numbers[index + 1] += sum; + } + } + } else { + for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size; + index += stride) { + auto rnd = 
deviceScale32(index * (counter + 1) * kPrime32, size); + auto sum = lookup[rnd]; + auto limit = min(rnd + localStride * (1 + numLocal), size); + for (auto j = rnd + localStride; j < limit; j += localStride) { + sum += lookup[j]; + } + numbers[index] += sum; + } } __syncthreads(); } + __syncthreads(); } void TestStream::addOneRandom( int32_t* numbers, const int32_t* lookup, int32_t size, - int32_t repeats) { - constexpr int32_t kWidth = 10240; + int32_t repeats, + int32_t width, + int32_t numLocal, + int32_t localStride, + bool emptyWarps, + bool emptyThreads) { constexpr int32_t kBlockSize = 256; auto numBlocks = roundUp(size, kBlockSize) / kBlockSize; int32_t stride = size; - if (numBlocks > kWidth / kBlockSize) { - stride = kWidth; - numBlocks = kWidth / kBlockSize; + if (numBlocks > width / kBlockSize) { + stride = width; + numBlocks = width / kBlockSize; } addOneRandomKernel<<stream>>>( - numbers, lookup, size, stride, repeats); + numbers, + lookup, + size, + stride, + repeats, + numLocal, + localStride, + emptyWarps, + emptyThreads); CUDA_CHECK(cudaGetLastError()); } +REGISTER_KERNEL("addOne", addOneKernel); +REGISTER_KERNEL("addOneWide", addOneWideKernel); +REGISTER_KERNEL("addOneRandom", addOneRandomKernel); + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/CudaTest.h b/velox/experimental/wave/common/tests/CudaTest.h index 758ad39a260ed..7d5716f753eac 100644 --- a/velox/experimental/wave/common/tests/CudaTest.h +++ b/velox/experimental/wave/common/tests/CudaTest.h @@ -35,15 +35,66 @@ class TestStream : public Stream { public: // Queues a kernel to add 1 to numbers[0...size - 1]. The kernel repeats // 'repeat' times. - void addOne(int32_t* numbers, int size, int32_t repeat = 1); + void + incOne(int32_t* numbers, int size, int32_t repeat = 1, int32_t width = 10240); - void addOneWide(int32_t* numbers, int32_t size, int32_t repeat = 1); + /// Like incOne but adds idx & 31 to numbers[idx]. + void + addOne(int32_t* numbers, int size, int32_t repeat = 1, int32_t width = 10240); + void addOneWide( + int32_t* numbers, + int32_t size, + int32_t repeat = 1, + int32_t width = 10240); + + /// Like addOne but uses shared memory for intermediates, with global + /// ead/write at start/end. + void addOneShared( + int32_t* numbers, + int32_t size, + int32_t repeat = 1, + int32_t width = 10240); + + /// Like addOne but uses registers for intermediates. + void addOneReg( + int32_t* numbers, + int32_t size, + int32_t repeat = 1, + int32_t width = 10240); + + /// Increments each of 'numbers by a deterministic pseudorandom + /// increment from 'lookup'. If 'numLocal is non-0, also accesses + /// 'numLocal' adjacent positions in 'lookup' with a stride of + /// 'localStride'. If 'emptyWarps' is true, odd warps do no work + /// but still sync with the other ones with __syncthreads(). If + /// 'emptyThreads' is true, odd lanes do no work and even lanes do + /// their work instead. void addOneRandom( int32_t* numbers, const int32_t* lookup, int size, - int32_t repeat = 1); + int32_t repeat = 1, + int32_t width = 10240, + int32_t numLocal = 0, + int32_t localStride = 0, + bool emptyWarps = false, + bool emptyLanes = false); + + // Makes random lookup keys and increments, starting at 'startCount' + // columns[0] is keys. 'powerOfTwo' is the next power of two from + // 'keyRange'. If 'powerOfTwo' is 0 the key columns are set to + // zero. 
Otherwise the key column values are incremented by a a + // delta + index of column where delta for element 0 is startCount & + // (powerOfTwo - 1). + void makeInput( + int32_t numRows, + int32_t keyRange, + int32_t powerOfTwo, + int32_t startCount, + uint64_t* hash, + uint8_t numColumns, + int64_t** columns); }; } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/GpuArenaTest.cpp b/velox/experimental/wave/common/tests/GpuArenaTest.cpp index f451980020619..a3bca9a2080b4 100644 --- a/velox/experimental/wave/common/tests/GpuArenaTest.cpp +++ b/velox/experimental/wave/common/tests/GpuArenaTest.cpp @@ -155,3 +155,23 @@ TEST_F(GpuArenaTest, buffers) { buffers.clear(); EXPECT_EQ(1, arena->slabs().size()); } + +TEST_F(GpuArenaTest, views) { + auto arena = std::make_unique(1 << 20, allocator_.get()); + WaveBufferPtr buffer = arena->allocate(1024); + EXPECT_EQ(1, buffer->refCount()); + WaveBufferPtr view = WaveBufferView::create( + buffer->as() + 10, 10, buffer); + EXPECT_EQ(2, buffer->refCount()); + EXPECT_EQ(1, view->refCount()); + auto view2 = view; + EXPECT_EQ(2, buffer->refCount()); + EXPECT_EQ(2, view->refCount()); + auto raw = buffer.get(); + buffer = nullptr; + EXPECT_EQ(1, raw->refCount()); + view = nullptr; + view2 = nullptr; + // This is reference to freed but the header is still in the arena. + EXPECT_EQ(0, raw->refCount()); +} diff --git a/velox/experimental/wave/common/tests/HashTableTest.cpp b/velox/experimental/wave/common/tests/HashTableTest.cpp new file mode 100644 index 0000000000000..0e5662911411c --- /dev/null +++ b/velox/experimental/wave/common/tests/HashTableTest.cpp @@ -0,0 +1,386 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "velox/common/time/Timer.h" +#include "velox/experimental/wave/common/Buffer.h" +#include "velox/experimental/wave/common/GpuArena.h" +#include "velox/experimental/wave/common/tests/BlockTest.h" +#include "velox/experimental/wave/common/tests/CpuTable.h" +#include "velox/experimental/wave/common/tests/HashTestUtil.h" + +#include + +namespace facebook::velox::wave { + +class CpuMockGroupByOps { + public: + bool + compare(CpuHashTable* table, TestingRow* row, int32_t i, HashProbe* probe) { + return row->key == reinterpret_cast(probe->keys)[0][i]; + } + + bool compare1(const CpuHashTable* table, TestingRow* row, int64_t key) { + return key == row->key; + } + + TestingRow* newRow(CpuHashTable* table, int32_t i, HashProbe* probe) { + auto row = table->newRow(); + row->key = reinterpret_cast(probe->keys)[0][i]; + row->flags = 0; + row->count = 0; + new (&row->concatenation) ArrayAgg64(); + return row; + } + + void + update(CpuHashTable* table, TestingRow* row, int32_t i, HashProbe* probe) { + auto* keys = reinterpret_cast(probe->keys); + row->count += keys[1][i]; + +#if 0 + int64_t arg = keys[1][i]; + int32_t part = table->partitionIdx(bucket - table->buckets); + auto* allocator = &table->allocators[part]; + auto state = arrayAgg64Append(&row->concatenation, arg, allocator); +#endif + } +}; + +class HashTableTest : public testing::Test { + protected: + void SetUp() override { + device_ = getDevice(); + setDevice(device_); + allocator_ = getAllocator(device_); + arena_ = std::make_unique(1 << 28, allocator_); + streams_.push_back(std::make_unique()); + } + + void prefetch(Stream& stream, WaveBufferPtr buffer) { + stream.prefetch(device_, buffer->as(), buffer->capacity()); + } + + // Tests different styles of updating a group by. Results are returned in + // 'run'. 
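One of the update styles compared below (sum1AtmCoa) coalesces updates within a warp before touching global memory: lanes that hit the same row elect a leader, combine their deltas with shuffles, and issue a single atomicAdd per distinct row. A hedged, stand-alone sketch of that pattern, assuming all 32 lanes of the warp are active and an sm_70+ device (__match_any_sync); the function name is illustrative:

```cuda
__device__ void coalescedAdd(
    unsigned long long* counts,
    int32_t index,
    unsigned long long delta) {
  // Mask of lanes in this warp whose 'index' equals ours.
  uint32_t peers = __match_any_sync(0xffffffff, index);
  int32_t leader = __ffs(peers) - 1; // lowest lane with this index
  unsigned long long total = 0;
  for (uint32_t rest = peers; rest != 0; rest &= rest - 1) {
    int32_t lane = __ffs(rest) - 1;
    total += __shfl_sync(peers, delta, lane); // gather each peer's delta
  }
  if (static_cast<int32_t>(threadIdx.x & 31) == leader) {
    atomicAdd(&counts[index], total); // one atomic per distinct index
  }
}
```

This is why the skewed-key cases favor the coalescing variants: with few hot keys, most lanes of a warp collide on the same row and the atomic traffic collapses to one operation per warp.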
+ void updateTestCase(int32_t numDistinct, int32_t numRows, HashRun& run) { + run.numRows = numRows; + run.numDistinct = numDistinct; + run.numColumns = 2; + run.numRowsPerThread = 32; + + initializeHashTestInput(run, arena_.get()); + fillHashTestInput( + run.numRows, + run.numDistinct, + bits::nextPowerOfTwo(run.numDistinct), + 1, + run.numColumns, + reinterpret_cast(run.probe->keys)); + std::vector reference(run.numDistinct); + for (auto i = 0; i < run.numDistinct; ++i) { + reference[i].key = i; + } + gpuRowsBuffer_ = arena_->allocate(run.numDistinct); + TestingRow* gpuRows = gpuRowsBuffer_->as(); + memcpy(gpuRows, reference.data(), sizeof(TestingRow) * run.numDistinct); + prefetch(*streams_[0], gpuRowsBuffer_); + prefetch(*streams_[0], run.gpuData); + streams_[0]->wait(); + updateCpu(reference.data(), run); + updateGpu(gpuRows, run, reference.data()); + std::cout << run.toString() << std::endl; + } + + void updateCpu(TestingRow* rows, HashRun& run) { + uint64_t micros = 0; + { + MicrosecondTimer t(µs); + switch (run.testCase) { + case HashTestCase::kUpdateSum1: { + int64_t** keys = reinterpret_cast(run.probe->keys); + int64_t* indices = keys[0]; + int64_t* data = keys[1]; + auto numRows = run.numRows; + for (auto i = 0; i < numRows; ++i) { + rows[indices[i]].count += data[i]; + } + break; + } + default: + VELOX_FAIL("Unsupported test case"); + } + } + run.addScore("cpu1t", micros); + } + +#define UPDATE_CASE(title, func, expectCorrect, nextFlags) \ + { \ + std::cout << title << std::endl; \ + MicrosecondTimer t(µs); \ + streams_[0]->func(rows, run); \ + streams_[0]->wait(); \ + } \ + run.addScore(title, micros); \ + micros = 0; \ + compareAndReset( \ + reference, rows, run.numDistinct, title, expectCorrect, nextFlags); + + void updateGpu(TestingRow* rows, HashRun& run, TestingRow* reference) { + uint64_t micros = 0; + switch (run.testCase) { + case HashTestCase::kUpdateSum1: + UPDATE_CASE("sum1Atm", updateSum1Atomic, true, 0); + UPDATE_CASE("sum1NoSync", updateSum1NoSync, false, 0); + UPDATE_CASE("sum1AtmCoa", updateSum1AtomicCoalesce, true, 1); + UPDATE_CASE("sum1Mtx", updateSum1Mtx, true, 1); + UPDATE_CASE("sum1MtxCoa", updateSum1MtxCoalesce, true, 0); + UPDATE_CASE("sum1Part", updateSum1Part, true, 0); + UPDATE_CASE("sum1Order", updateSum1Order, true, 0); + // UPDATE_CASE("sum1Exch", updateSum1Exch, false, 0); + + break; + default: + VELOX_FAIL("Unsupported test case"); + } + } + + void compareAndReset( + TestingRow* reference, + TestingRow* rows, + int32_t numRows, + const char* title, + bool expectCorrect, + int32_t initFlags = 0) { + int32_t numError = 0; + int64_t errorSigned = 0; + int64_t errorDelta = 0; + for (auto i = 0; i < numRows; ++i) { + if (rows[i].count == reference[i].count) { + continue; + } + if (numError == 0 && expectCorrect) { + std::cout << "In " << title << std::endl; + EXPECT_EQ(reference[i].count, rows[i].count) << " at " << i; + } + ++numError; + int64_t d = reference[i].count - rows[i].count; + errorSigned += d; + errorDelta += d < 0 ? 
-d : d; + } + if (numError) { + std::cout << fmt::format( + "{}: numError={} errorDelta={} errorSigned={}", + title, + numError, + errorDelta, + errorSigned) + << std::endl; + } + for (auto i = 0; i < numRows; ++i) { + new (rows + i) TestingRow(); + rows[i].key = i; + rows[i].flags = initFlags; + } + prefetch(*streams_[0], gpuRowsBuffer_); + streams_[0]->wait(); + } + + void groupTestCase(int32_t numDistinct, int32_t numRows, HashRun& run) { + run.numRows = numRows; + run.numDistinct = numDistinct; + if (!run.numSlots) { + run.numSlots = bits::nextPowerOfTwo(numDistinct); + } + run.numColumns = 2; + run.numRowsPerThread = 32; + + initializeHashTestInput(run, arena_.get()); + fillHashTestInput( + run.numRows, + run.numDistinct, + bits::nextPowerOfTwo(run.numDistinct), + 1, + run.numColumns, + reinterpret_cast(run.probe->keys)); + CpuHashTable cpuTable(run.numSlots, sizeof(TestingRow) * run.numDistinct); + cpuGroupBy(cpuTable, run); + gpuGroupBy(cpuTable, run); + std::cout << run.toString() << std::endl; + } + + void cpuGroupBy(CpuHashTable& table, HashRun& run) { + uint64_t time = 0; + { + MicrosecondTimer t(&time); + int64_t* key = reinterpret_cast(run.probe->keys)[0]; + auto* hashes = run.probe->hashes; + for (auto i = 0; i < run.numRows; ++i) { + hashes[i] = bits::hashMix(1, key[i]); + } + table.updatingProbe( + run.numRows, run.probe, CpuMockGroupByOps()); + } + run.addScore("cpu1T", time); + } + + void gpuGroupBy(const CpuHashTable& reference, HashRun& run) { + WaveBufferPtr gpuTableBuffer; + GpuHashTableBase* gpuTable; + setupGpuTable( + run.numSlots, + run.numRows, + sizeof(TestingRow), + arena_.get(), + gpuTable, + gpuTableBuffer); + prefetch(*streams_[0], run.gpuData); + prefetch(*streams_[0], gpuTableBuffer); + streams_[0]->wait(); + uint64_t micros = 0; + { + MicrosecondTimer t(µs); + streams_[0]->hashTest(gpuTable, run, BlockTestStream::HashCase::kGroup); + streams_[0]->wait(); + } + run.addScore("gpu", micros); + checkGroupBy(reference, gpuTable); + } + + void checkGroupBy(const CpuHashTable& reference, GpuHashTableBase* table) { + int32_t numChecked = 0; + for (auto i = 0; i <= table->sizeMask; ++i) { + for (auto j = 0; j < 4; ++j) { + auto* row = reinterpret_cast(table->buckets)[i] + .testingLoad(j); + if (row == nullptr) { + continue; + } + ++numChecked; + auto referenceRow = reference.find( + row->key, bits::hashMix(1, row->key), CpuMockGroupByOps()); + ASSERT_TRUE(referenceRow != nullptr); + EXPECT_EQ(referenceRow->count, row->count); + } + } + EXPECT_EQ(reference.size, numChecked); + } + + Device* device_; + GpuAllocator* allocator_; + std::unique_ptr arena_; + std::vector> streams_; + WaveBufferPtr gpuRowsBuffer_; +}; + +TEST_F(HashTableTest, allocator) { + constexpr int32_t kNumThreads = 256; + constexpr int32_t kTotal = 1 << 22; + WaveBufferPtr data = arena_->allocate(kTotal); + auto* allocator = data->as(); + auto freeSetSize = BlockTestStream::freeSetSize(); + new (allocator) HashPartitionAllocator( + data->as() + sizeof(HashPartitionAllocator) + freeSetSize, + kTotal - sizeof(HashPartitionAllocator) - freeSetSize, + 16, + allocator + 1); + memset(allocator->freeSet, 0, freeSetSize); + WaveBufferPtr allResults = arena_->allocate(kNumThreads); + auto results = allResults->as(); + for (auto i = 0; i < kNumThreads; ++i) { + results[i].allocator = reinterpret_cast(allocator); + results[i].numRows = 0; + results[i].numStrings = 0; + } + auto stream1 = std::make_unique(); + auto stream2 = std::make_unique(); + stream1->initAllocator(allocator); + stream1->wait(); + 
stream1->rowAllocatorTest(2, 4, 3, 2, results); + stream2->rowAllocatorTest(2, 4, 3, 2, results + 128); + + stream1->wait(); + stream2->wait(); + // Pointer to result idx, position in result; + std::unordered_map uniques; + for (auto resultIdx = 0; resultIdx < kNumThreads; ++resultIdx) { + auto* result = results + resultIdx; + for (auto i = 0; i < result->numRows; ++i) { + auto row = result->rows[i]; + EXPECT_GE(reinterpret_cast(row), allocator->base); + EXPECT_LT( + reinterpret_cast(row), + allocator->base + allocator->capacity); + auto it = uniques.find(row); + EXPECT_TRUE(it == uniques.end()) << fmt::format( + "row {} is also at {} {}", + reinterpret_cast(row), + it->second >> 24, + it->second & bits::lowMask(24)); + + uniques[row] = (resultIdx << 24) | i; + } + for (auto i = 0; i < result->numStrings; ++i) { + auto string = result->strings[i]; + EXPECT_GE(reinterpret_cast(string), allocator->base); + EXPECT_LT( + reinterpret_cast(string), + allocator->base + allocator->capacity); + auto it = uniques.find(string); + EXPECT_TRUE(it == uniques.end()) << fmt::format( + "String {} is also at {} {}", + reinterpret_cast(string), + it->second >> 24, + it->second & bits::lowMask(24)); + uniques[string] = (resultIdx << 24) | i; + } + } +} + +TEST_F(HashTableTest, update) { + { + HashRun run; + run.testCase = HashTestCase::kUpdateSum1; + updateTestCase(1000, 2000000, run); + } + { + HashRun run; + run.testCase = HashTestCase::kUpdateSum1; + updateTestCase(10000000, 2000000, run); + } + { + HashRun run; + run.testCase = HashTestCase::kUpdateSum1; + updateTestCase(10, 2000000, run); + } +} + +TEST_F(HashTableTest, groupBy) { + { + HashRun run; + run.testCase = HashTestCase::kGroupSum1; + run.numSlots = 2048; + groupTestCase(1000, 2000000, run); + } + { + HashRun run; + run.testCase = HashTestCase::kGroupSum1; + run.numSlots = 8 << 20; + groupTestCase(5000000, 50000000, run); + } +} + +} // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/HashTestUtil.cpp b/velox/experimental/wave/common/tests/HashTestUtil.cpp new file mode 100644 index 0000000000000..60bf3a6a60b29 --- /dev/null +++ b/velox/experimental/wave/common/tests/HashTestUtil.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/experimental/wave/common/tests/HashTestUtil.h" +#include +#include "velox/common/base/BitUtil.h" +#include "velox/experimental/wave/common/Buffer.h" +#include "velox/experimental/wave/common/GpuArena.h" +#include "velox/experimental/wave/common/HashTable.h" + +namespace facebook::velox::wave { + +constexpr uint32_t kPrime32 = 1815531889; +inline uint32_t scale32(uint32_t n, uint32_t scale) { + return (static_cast(static_cast(n)) * scale) >> 32; +} + +// Returns the byte size for a GpuProbe with numRows as first, rounded row count +// as second. 
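initializeHashTestInput below lays out the HashProbe header, per-block row counts, hash array, retry lists, and key columns inside one contiguous allocation by walking a byte cursor forward. A hedged sketch of that carving pattern; the Carver type and carveExample are illustrative, not from the patch:

```cpp
#include <cstdint>

// Hands out typed sections of one backing allocation by advancing a byte
// cursor; mirrors the 'data += sizeof(...) * n' walk in initializeHashTestInput.
struct Carver {
  char* cursor;

  template <typename T>
  T* take(int64_t count) {
    T* section = reinterpret_cast<T*>(cursor);
    cursor += sizeof(T) * count;
    return section;
  }
};

// Example: carve the hash array and two retry lists for 'rows' entries.
void carveExample(char* data, int32_t rows) {
  Carver c{data};
  uint64_t* hashes = c.take<uint64_t>(rows);
  int32_t* retries1 = c.take<int32_t>(rows);
  int32_t* retries2 = c.take<int32_t>(rows);
  (void)hashes;
  (void)retries1;
  (void)retries2;
}
```

Keeping everything in one buffer means a single prefetch moves the whole probe description to the device.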
+std::pair probeSize(HashRun& run) { + int32_t roundedRows = + bits::roundUp(run.numRows, run.blockSize * run.numRowsPerThread); + return { + sizeof(HashProbe) + + // Column data and hash number array. + (1 + run.numColumns) * roundedRows * sizeof(int64_t) + // Pointers to column starts + + sizeof(int64_t*) * run.numColumns + // retry lists + + 3 * sizeof(int32_t) * roundedRows + + // numRows for each block. + sizeof(int32_t) * roundedRows / + (run.blockSize * run.numRowsPerThread) + + // Temp space for partitioning. + roundedRows * sizeof(int32_t) + + // alignment padding + 256, + roundedRows}; +} + +void fillHashTestInput( + int32_t numRows, + int32_t keyRange, + int32_t powerOfTwo, + int64_t counter, + uint8_t numColumns, + int64_t** columns, + int32_t numHot, + int32_t hotPct) { + int32_t delta = counter & (powerOfTwo - 1); + for (auto i = 0; i < numRows; ++i) { + auto previous = columns[0][i]; + auto seed = (previous + delta + i) * kPrime32; + if (hotPct && scale32(seed >> 32, 100) <= hotPct) { + int32_t nth = scale32(seed, numHot); + nth = std::min( + keyRange - 1, nth * (static_cast(keyRange) / nth)); + columns[0][i] = nth; + } else { + columns[0][i] = scale32(seed, keyRange); + } + } + counter += numRows; + for (auto c = 1; c < numColumns; ++c) { + for (auto r = 0; r < numRows; ++r) { + columns[c][r] = 1; // c + (r & 7); + } + } +} + +void initializeHashTestInput(HashRun& run, GpuArena* arena) { + auto [bytes, roundedRows] = probeSize(run); + if (!arena) { + run.isCpu = true; + run.cpuData = std::make_unique(bytes); + run.input = run.cpuData.get(); + } else { + run.isCpu = false; + run.gpuData = arena->allocate(bytes); + run.input = run.gpuData->as(); + } + auto data = run.input; + auto dataBegin = data; + HashProbe* probe = new (data) HashProbe(); + run.probe = probe; + data += sizeof(HashProbe); + probe->numRows = reinterpret_cast(data); + data += bits::roundUp( + sizeof(int32_t) * roundedRows / (run.numRowsPerThread * run.blockSize), + 8); + if (!arena) { + probe->numRows[0] = run.numRows; + } else { + run.numBlocks = roundedRows / (run.blockSize * run.numRowsPerThread); + for (auto i = 0; i < run.numBlocks; ++i) { + if (i == run.numBlocks - 1) { + probe->numRows[i] = + run.numRows - (i * run.blockSize * run.numRowsPerThread); + break; + } + probe->numRows[i] = run.blockSize * run.numRowsPerThread; + ; + } + } + probe->numRowsPerThread = run.numRowsPerThread; + probe->hashes = reinterpret_cast(data); + data += sizeof(uint64_t) * roundedRows; + probe->keys = data; + data += sizeof(void*) * run.numColumns; + probe->kernelRetries1 = reinterpret_cast(data); + data += sizeof(int32_t) * roundedRows; + probe->kernelRetries2 = reinterpret_cast(data); + data += sizeof(int32_t) * roundedRows; + probe->hostRetries = reinterpret_cast(data); + data += sizeof(int32_t) * roundedRows; + for (auto i = 0; i < run.numColumns; ++i) { + reinterpret_cast(probe->keys)[i] = + reinterpret_cast(data); + data += sizeof(int64_t) * roundedRows; + } + run.partitionTemp = reinterpret_cast(data); + data += bits::roundUp(sizeof(int32_t) * roundedRows, 8); + VELOX_CHECK_LE(data - dataBegin, bytes); +} + +void setupGpuTable( + int32_t numSlots, + int32_t maxRows, + int64_t rowSize, + GpuArena* arena, + GpuHashTableBase*& table, + WaveBufferPtr& buffer) { + using FreeSetType = FreeSetBase; + // GPU cache lines are 128 bytes divided in 4 separately loadable 32 byte + // sectors. 
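setupGpuTable sizes the table at four row slots per 128-byte bucket, so the bucket count is the next power of two of numSlots / 4 and probing wraps with a mask rather than a modulo. A small worked example of the arithmetic; the local nextPowerOfTwo is assumed to match the semantics of velox::bits::nextPowerOfTwo for these values:

```cpp
#include <cstdint>

// Round 'n' up to the next power of two (assumption: matches
// bits::nextPowerOfTwo for the inputs used in these tests).
inline uint32_t nextPowerOfTwo(uint32_t n) {
  uint32_t p = 1;
  while (p < n) {
    p <<= 1;
  }
  return p;
}

int main() {
  int32_t numSlots = 2048; // as in the groupBy test above
  uint32_t numBuckets = nextPowerOfTwo(numSlots / 4); // 512 buckets
  uint32_t sizeMask = numBuckets - 1; // bucket = hash & sizeMask
  return sizeMask == 511 ? 0 : 1;
}
```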
+ constexpr int32_t kAlignment = 128; + int32_t numBuckets = bits::nextPowerOfTwo(numSlots / 4); + int64_t bytes = sizeof(GpuHashTableBase) + sizeof(HashPartitionAllocator) + + sizeof(FreeSetType) + sizeof(GpuBucketMembers) * numBuckets + + maxRows * rowSize; + buffer = arena->allocate(bytes + kAlignment); + table = buffer->as(); + new (table) GpuHashTableBase(); + table->sizeMask = numBuckets - 1; + char* data = reinterpret_cast(table + 1); + table->allocators = reinterpret_cast(data); + auto allocatorBase = + reinterpret_cast(table->allocators); + data += sizeof(HashPartitionAllocator); + auto freeSet = reinterpret_cast(data); + new (freeSet) FreeSetType(); + data += sizeof(FreeSetType); + // The buckets start at aligned address. + data = reinterpret_cast( + bits::roundUp(reinterpret_cast(data), kAlignment)); + table->buckets = reinterpret_cast(data); + data += sizeof(GpuBucketMembers) * numBuckets; + auto allocator = reinterpret_cast(table->allocators); + new (allocator) + HashPartitionAllocator(data, maxRows * rowSize, rowSize, freeSet); + table->partitionMask = 0; + table->partitionShift = 0; + memset(table->buckets, 0, sizeof(GpuBucketMembers) * (table->sizeMask + 1)); +} + +std::string HashRun::toString() const { + std::stringstream out; + std::string opLabel = testCase == HashTestCase::kUpdateSum1 ? "update sum1" + : testCase == HashTestCase::kGroupSum1 ? "groupSum1" + : "update array_agg1"; + out << "===" << label << ":" << opLabel << " distinct=" << numDistinct + << " rows=" << numRows << " (" << numBlocks << "x" << blockSize << "x" + << numRowsPerThread << ") "; + if (hotPct) { + out << " skew " << hotPct << "% in " << numHot << " "; + } + auto sorted = scores; + std::sort(sorted.begin(), sorted.end(), [](auto& left, auto& right) { + return left.second < right.second; + }); + float gb = + numRows * sizeof(int64_t) * numColumns / static_cast(1 << 30); + for (auto& score : sorted) { + out << std::endl + << " * " + << fmt::format( + " {}={:.2f} rps {:.2f} GB/s {} us {:.2f}x", + score.first, + numRows / (score.second / 1e6), + gb / (score.second / 1e6), + score.second, + score.second / sorted[0].second); + } + return out.str(); +} + +void HashRun::addScore(const char* label, uint64_t micros) { + scores.push_back(std::make_pair(label, micros)); +} +} // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/HashTestUtil.h b/velox/experimental/wave/common/tests/HashTestUtil.h new file mode 100644 index 0000000000000..43703f9771274 --- /dev/null +++ b/velox/experimental/wave/common/tests/HashTestUtil.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "velox/experimental/wave/common/Buffer.h" +#include "velox/experimental/wave/common/HashTable.h" + +namespace facebook::velox::wave { + +/// Identifies operation being tested. 
A collection of representative hash
+/// table ops like aggregates, probes, and builds with different functions and
+/// layouts.
+enum class HashTestCase {
+  // bigint sum. Update only, no hash table.
+  kUpdateSum1,
+  // Group by with bigint sum.
+  kGroupSum1,
+  // array_agg of bigint. Update only, no hash table.
+  kUpdateArrayAgg1
+};
+
+/// Describes a hash table benchmark case.
+struct HashRun {
+  // Label of the test case. Describes what is done. The labels for different
+  // implementations come from 'scores'.
+  std::string label;
+  // The operation being measured.
+  HashTestCase testCase;
+  // CPU/GPU measurement.
+  bool isCpu;
+
+  // Number of slots in table.
+  int32_t numSlots{0};
+
+  // Number of probe rows.
+  int32_t numRows;
+
+  // Number of distinct keys.
+  int32_t numDistinct;
+
+  // Number of distinct hot keys.
+  int32_t numHot{0};
+
+  // Percentage of hot keys over total keys, e.g. with 1000 distinct keys, 10
+  // hot keys and a hotPct of 50, every second key will be one of the 10 and
+  // the rest are evenly spread over the 1000.
+  int32_t hotPct{0};
+
+  // Number of keys processed by each thread of each block.
+  int32_t numRowsPerThread;
+
+  int32_t blockSize{256};
+
+  // Number of blocks of 'blockSize' threads.
+  int32_t numBlocks;
+
+  // Number of columns. Key is column 0.
+  uint8_t numColumns{1};
+
+  // Number of independent hash tables.
+  int32_t numTables{1};
+
+  // Result, labeled by implementation alternative.
+  std::vector<std::pair<const char*, uint64_t>> scores;
+
+  std::unique_ptr<char[]> cpuData;
+  WaveBufferPtr gpuData;
+
+  // Input data, either cpuData or gpuData.
+  char* input;
+
+  // Initialized probe params, contained in 'input'.
+  HashProbe* probe;
+
+  // One int per row, used for partitioning intermediates. Uninitialized.
+  int32_t* partitionTemp;
+
+  int32_t* partitionArgs;
+
+  std::string toString() const;
+  void addScore(const char* label, uint64_t micros);
+  void clearScore() {
+    scores.clear();
+  }
+};
+
+void fillHashTestInput(
+    int32_t numRows,
+    int32_t keyRange,
+    int32_t powerOfTwo,
+    int64_t counter,
+    uint8_t numColumns,
+    int64_t** columns,
+    int32_t numHot = 0,
+    int32_t hotPct = 0);
+
+void initializeHashTestInput(HashRun& run, GpuArena* arena);
+
+void setupGpuTable(
+    int32_t numSlots,
+    int32_t maxRows,
+    int64_t rowSize,
+    GpuArena* arena,
+    GpuHashTableBase*& table,
+    WaveBufferPtr& buffer);
+
+} // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/tests/Updates.cuh b/velox/experimental/wave/common/tests/Updates.cuh
new file mode 100644
index 0000000000000..e25b2c918be87
--- /dev/null
+++ b/velox/experimental/wave/common/tests/Updates.cuh
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include "velox/experimental/wave/common/HashTable.cuh" +#include "velox/experimental/wave/common/tests/BlockTest.h" + +namespace facebook::velox::wave { + +using Mutex = cuda::binary_semaphore; + +inline void __device__ testingLock(int32_t* mtx) { + reinterpret_cast(mtx)->acquire(); +} + +inline void __device__ testingUnlock(int32_t* mtx) { + reinterpret_cast(mtx)->release(); +} + +void __device__ testSumNoSync(TestingRow* rows, HashProbe* probe) { + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto i = base + threadIdx.x; i < end; i += blockDim.x) { + auto* row = &rows[indices[i]]; + row->count += deltas[i]; + } +} + +void __device__ testSumPart( + TestingRow* rows, + int32_t numParts, + HashProbe* probe, + int32_t* part, + int32_t* partEnd, + int32_t numGroups, + int32_t groupStride) { + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + for (auto groupIdx = 0; groupIdx < numGroups; ++groupIdx) { + auto groupStart = groupIdx * groupStride; + int32_t linear = threadIdx.x + blockIdx.x * blockDim.x; + if (linear > numParts) { + break; + } + int32_t begin = linear == 0 ? groupStart + : groupStart + partEnd[groupStart + linear - 1]; + int32_t end = groupStart + partEnd[groupStart + linear]; + + for (auto i = begin; i < end; ++i) { + auto index = groupStart + part[i]; + auto* row = &rows[indices[index]]; + row->count += deltas[index]; + } + } + __syncthreads(); +} + +void __device__ testSumMtx(TestingRow* rows, HashProbe* probe) { + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto i = base + threadIdx.x; i < end; i += blockDim.x) { + auto* row = &rows[indices[i]]; + testingLock(&row->flags); + row->count += deltas[i]; + testingUnlock(&row->flags); + } +} + +void __device__ testSumAtomic(TestingRow* rows, HashProbe* probe) { + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto i = base + threadIdx.x; i < end; i += blockDim.x) { + auto* row = &rows[indices[i]]; + atomicAdd((unsigned long long*)&row->count, (unsigned long long)deltas[i]); + } +} + +void __device__ testSumAtomicCoalesce(TestingRow* rows, HashProbe* probe) { + constexpr int32_t kWarpThreads = 32; + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t lane = cub::LaneId(); + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto count = base; count < end; count += blockDim.x) { + auto i = threadIdx.x + count; + + if (i < end) { + uint32_t laneMask = count + kWarpThreads <= end + ? 
0xffffffff + : lowMask(end - count); + auto index = indices[i]; + auto delta = deltas[i]; + uint32_t allPeers = __match_any_sync(laneMask, index); + int32_t leader = __ffs(allPeers) - 1; + auto peers = allPeers; + int64_t total = 0; + auto currentPeer = leader; + for (;;) { + total += __shfl_sync(allPeers, delta, currentPeer); + peers &= peers - 1; + if (peers == 0) { + break; + } + currentPeer = __ffs(peers) - 1; + } + if (lane == leader) { + auto* row = &rows[index]; + atomicAdd((unsigned long long*)&row->count, (unsigned long long)total); + } + } + } +} + +void __device__ testSumExch(TestingRow* rows, HashProbe* probe) { + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t end = base + probe->numRows[blockIdx.x]; + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + + extern __shared__ __align__(16) char smem[]; + ProbeShared* shared = reinterpret_cast(smem); + if (threadIdx.x == 0) { + shared->init(probe, base); + shared->blockEnd = end; + shared->toDo = probe->numRows[blockIdx.x]; + shared->numRounds = 0; + shared->numUpdated = 0; + shared->numTried = 0; + } + __syncthreads(); + for (;;) { + if (shared->blockEnd <= shared->blockBase) { + GPF(); + } + int32_t counter; + for (counter = base; counter < shared->blockEnd; counter += blockDim.x) { + auto i = counter + threadIdx.x; + if (i < shared->blockEnd) { + atomicAdd(&shared->numTried, 1); + if (shared->inputRetries) { + i = shared->inputRetries[i]; + } + auto* row = &rows[indices[i]]; + if (0 == + asDeviceAtomic(&row->flags) + ->exchange(1, cuda::memory_order_consume)) { + atomicAdd( + (unsigned long long*)&row->count, (unsigned long long)deltas[i]); + atomicAdd(&shared->numUpdated, 1); + asDeviceAtomic(&row->flags) + ->store(0, cuda::memory_order_release); + } else { + shared + ->outputRetries[base + atomicAdd(&shared->numKernelRetries, 1)] = + i; + } + } else { + atomicAdd(&shared->numTried, 1 << 16); + } + // __syncthreads(); + } + __syncthreads(); + if (shared->numKernelRetries == 0) { + if ((shared->numTried & 0xffff) != shared->blockEnd - shared->blockBase) { + GPF(); + } + if (shared->done + (shared->blockEnd - shared->blockBase) != + shared->toDo) { + GPF(); + } + // printf("%d %d //%d\n", base, end, counter); + return; + } + + if (threadIdx.x == 0) { + shared->done += + (shared->blockEnd - shared->blockBase) - shared->numKernelRetries; + ++shared->numRounds; + shared->numTried = 0; + shared->blockEnd = base + shared->numKernelRetries; + shared->nextRound(probe); + } + __syncthreads(); + } +} +void __device__ testSumMtxCoalesce(TestingRow* rows, HashProbe* probe) { + constexpr int32_t kWarpThreads = 32; + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t lane = cub::LaneId(); + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto count = base; count < end; count += blockDim.x) { + auto i = threadIdx.x + count; + + if (i < end) { + uint32_t laneMask = count + kWarpThreads <= end + ? 
0xffffffff + : lowMask(end - count); + auto index = indices[i]; + auto delta = deltas[i]; + uint32_t allPeers = __match_any_sync(laneMask, index); + int32_t leader = __ffs(allPeers) - 1; + auto peers = allPeers; + int64_t total = 0; + auto currentPeer = leader; + for (;;) { + total += __shfl_sync(allPeers, delta, currentPeer); + peers &= peers - 1; + if (peers == 0) { + break; + } + currentPeer = __ffs(peers) - 1; + } + if (lane == leader) { + auto* row = &rows[index]; + testingLock(&row->flags); + row->count += total; + testingUnlock(&row->flags); + } + } + } +} + +void __device__ testSumOrder(TestingRow* rows, HashProbe* probe) { + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto i = base + threadIdx.x; i < end; i += blockDim.x) { + auto* row = &rows[indices[i]]; + int32_t waitNano = 1; + auto d = deltas[i]; + for (;;) { + if (0 == + asDeviceAtomic(&row->flags) + ->exchange(1, cuda::memory_order_consume)) { + row->count += d; + asDeviceAtomic(&row->flags) + ->store(0, cuda::memory_order_release); + break; + } else { + __nanosleep(waitNano); + waitNano += threadIdx.x & 31; + } + } + } +} + +} // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/Filter.h b/velox/experimental/wave/common/tests/Util.h similarity index 72% rename from velox/experimental/wave/exec/Filter.h rename to velox/experimental/wave/common/tests/Util.h index a78502306f07e..5b39a26587378 100644 --- a/velox/experimental/wave/exec/Filter.h +++ b/velox/experimental/wave/common/tests/Util.h @@ -16,20 +16,12 @@ #pragma once -#include "velox/experimental/wave/exec/WaveOperator.h" +#include namespace facebook::velox::wave { -class Filter : public WaveOperator { - public: - Filter(RowTypePtr inputType, exec::ExprSet exprSet); - - bool isStreaming() const override { - return true; - } - - private: - std::vector input_; -}; +inline uint32_t scale32(uint32_t n, uint32_t scale) { + return (static_cast(static_cast(n)) * scale) >> 32; +} } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/dwio/ColumnReader.cpp b/velox/experimental/wave/dwio/ColumnReader.cpp index 7534b853cf586..24162236df549 100644 --- a/velox/experimental/wave/dwio/ColumnReader.cpp +++ b/velox/experimental/wave/dwio/ColumnReader.cpp @@ -28,7 +28,8 @@ void ColumnReader::makeOp( formatData_->newBatch(readOffset_ + offset); op.action = action; op.reader = this; - op.waveVector = readStream->operandVector(operand_, requestedType_); + readStream->setNullable(*operand_, formatData_->hasNulls()); + op.waveVector = readStream->operandVector(operand_->id, requestedType_); op.rows = rows; readOffset_ = offset + rows.back() + 1; }; diff --git a/velox/experimental/wave/dwio/ColumnReader.h b/velox/experimental/wave/dwio/ColumnReader.h index 1297e1aec9fe8..8aeaa9da6f29c 100644 --- a/velox/experimental/wave/dwio/ColumnReader.h +++ b/velox/experimental/wave/dwio/ColumnReader.h @@ -31,13 +31,16 @@ class ColumnReader { ColumnReader( const TypePtr& requestedType, std::shared_ptr fileType, - OperandId operand, + AbstractOperand* operand, FormatParams& params, velox::common::ScanSpec& scanSpec) : requestedType_(requestedType), fileType_(fileType), operand_(operand), - formatData_(params.toFormatData(fileType_, scanSpec, operand)), + formatData_(params.toFormatData( + fileType_, + scanSpec, + operand ? 
            operand->id : kNoOperand)),
        scanSpec_(&scanSpec) {}
 
   virtual ~ColumnReader() = default;
@@ -54,7 +57,7 @@ class ColumnReader {
     return formatData_->totalRows();
   }
 
-  OperandId operand() const {
+  AbstractOperand* operand() const {
     return operand_;
   }
 
@@ -72,7 +75,7 @@ class ColumnReader {
  protected:
   TypePtr requestedType_;
   std::shared_ptr<const dwio::common::TypeWithId> fileType_;
-  const OperandId operand_;
+  AbstractOperand* const operand_;
   std::unique_ptr<FormatData> formatData_;
   // Specification of filters, value extraction, pruning etc. The
   // spec is assigned at construction and the contents may change at
@@ -95,6 +98,10 @@ class ReadStream : public Executable {
       WaveStream& waveStream,
       const OperandSet* firstColumns = nullptr);
 
+  void setNullable(const AbstractOperand& op, bool nullable) {
+    waveStream->setNullable(op, nullable);
+  }
+
   /// Runs a sequence of kernel invocations until all eagerly produced columns
   /// have their last kernel in flight. Transfers ownership of 'readStream' to
   /// its WaveStream.
@@ -115,11 +122,19 @@ class ReadStream : public Executable {
  private:
   /// Makes column dependencies.
   void makeOps();
+  void makeControl();
 
   StructColumnReader* reader_;
+  std::vector<AbstractOperand*> abstractOperands_;
+
+  // Offset from end of previous read.
   int32_t offset_;
+
+  // Row numbers to read starting after skipping 'offset_'.
   RowSet rows_;
   std::vector ops_;
+  // Count of kBlockSize blocks in max top level rows.
+  int32_t numBlocks_{0};
   std::vector<std::unique_ptr<SplitStaging>> staging_;
   SplitStaging* currentStaging_;
@@ -129,6 +144,12 @@
   ResultStaging deviceStaging_;
   // Reusable control block for launching decode kernels.
   DecodePrograms programs_;
+  // If no filters, the starting RowSet directly initializes the BlockStatus'es
+  // at the end of the ReadStream.
+  bool hasFilters_{false};
+  // Sequence number of kernel launch.
+  int32_t nthWave_{0};
+  LaunchControl* control_{nullptr};
 };
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/dwio/FormatData.h b/velox/experimental/wave/dwio/FormatData.h
index 9eb40dc1fe4e4..e2479dec1e9ef 100644
--- a/velox/experimental/wave/dwio/FormatData.h
+++ b/velox/experimental/wave/dwio/FormatData.h
@@ -64,6 +64,9 @@ class SplitStaging {
         id, reinterpret_cast(reinterpret_cast(pointer)));
   }
 
+  int64_t bytesToDevice() const {
+    return fill_;
+  }
+
   // Starts the transfers registered with add() on 'stream'.
void transfer(WaveStream& waveStream, Stream& stream); diff --git a/velox/experimental/wave/dwio/ReadStream.cpp b/velox/experimental/wave/dwio/ReadStream.cpp index 891fb0a653d78..18789ff1f1d10 100644 --- a/velox/experimental/wave/dwio/ReadStream.cpp +++ b/velox/experimental/wave/dwio/ReadStream.cpp @@ -18,13 +18,21 @@ #include "velox/experimental/wave/dwio/StructColumnReader.h" namespace facebook::velox::wave { -void allOperands(const ColumnReader* reader, OperandSet& operands) { + +void allOperands( + const ColumnReader* reader, + OperandSet& operands, + std::vector* abstractOperands) { auto op = reader->operand(); - if (op != kNoOperand) { - operands.add(op); + if (op != nullptr) { + operands.add(op->id); + if (abstractOperands) { + abstractOperands->push_back(op); + } } + for (auto& child : reader->children()) { - allOperands(child, operands); + allOperands(child, operands, abstractOperands); } } @@ -36,7 +44,7 @@ ReadStream::ReadStream( const OperandSet* firstColumns) : Executable(), offset_(offset), rows_(rows) { waveStream = &_waveStream; - allOperands(columnReader, outputOperands); + allOperands(columnReader, outputOperands, &abstractOperands_); output.resize(outputOperands.size()); reader_ = columnReader; staging_.push_back(std::make_unique()); @@ -83,6 +91,16 @@ bool ReadStream::makePrograms(bool& needSync) { allDone = false; } } + if (!hasFilters_ && allDone) { + auto setCount = std::make_unique(); + setCount->step = DecodeStep::kRowCountNoFilter; + setCount->data.rowCountNoFilter.numRows = rows_.size(); + setCount->data.rowCountNoFilter.status = + control_->deviceData->as(); + programs_.programs.emplace_back(); + programs_.programs.back().push_back(std::move(setCount)); + } + ++nthWave_; resultStaging_.setReturnBuffer(waveStream->arena(), programs_); return allDone; } @@ -90,38 +108,70 @@ bool ReadStream::makePrograms(bool& needSync) { // static void ReadStream::launch(std::unique_ptr&& readStream) { using UniqueExe = std::unique_ptr; - readStream->waveStream->installExecutables( + // The function of control here is to have a status and row count for each + // kBlockSize top level rows of output and to have Operand structs for the + // produced column. 
+ readStream->makeControl(); + auto numRows = readStream->rows_.size(); + auto waveStream = readStream->waveStream; + WaveStats& stats = waveStream->stats(); + waveStream->installExecutables( folly::Range(reinterpret_cast(&readStream), 1), [&](Stream* stream, folly::Range exes) { auto* readStream = reinterpret_cast(exes[0]); bool needSync = false; for (;;) { bool done = readStream->makePrograms(needSync); - readStream->currentStaging_->transfer( - *readStream->waveStream, *stream); + stats.bytesToDevice += readStream->currentStaging_->bytesToDevice(); + ++stats.numKernels; + stats.numPrograms += readStream->programs_.programs.size(); + stats.numThreads += readStream->programs_.programs.size() * + std::min(readStream->rows_.size(), kBlockSize); + readStream->currentStaging_->transfer(*waveStream, *stream); if (done) { break; } WaveBufferPtr extra; launchDecode( - readStream->programs(), - &readStream->waveStream->arena(), - extra, - stream); + readStream->programs(), &waveStream->arena(), extra, stream); readStream->staging_.push_back(std::make_unique()); readStream->currentStaging_ = readStream->staging_.back().get(); if (needSync) { + waveStream->setState(WaveStream::State::kWait); stream->wait(); + readStream->waveStream->setState(WaveStream::State::kHost); + } else { + readStream->waveStream->setState(WaveStream::State::kParallel); } } + WaveBufferPtr extra; launchDecode( readStream->programs(), &readStream->waveStream->arena(), extra, stream); + readStream->waveStream->setState(WaveStream::State::kParallel); readStream->waveStream->markLaunch(*stream, *readStream); }); } +void ReadStream::makeControl() { + auto numRows = rows_.size(); + numBlocks_ = bits::roundUp(numRows, kBlockSize) / kBlockSize; + waveStream->setNumRows(numRows); + WaveStream::ExeLaunchInfo info; + waveStream->exeLaunchInfo(*this, numBlocks_, info); + auto statusBytes = sizeof(BlockStatus) * numBlocks_; + auto deviceBytes = statusBytes + info.totalBytes; + auto control = std::make_unique(0, numRows); + control->deviceData = waveStream->arena().allocate(deviceBytes); + control->status = control->deviceData->as(); + + operands = waveStream->fillOperands( + *this, control->deviceData->as() + statusBytes, info)[0]; + control_ = control.get(); + waveStream->addLaunchControl(0, std::move(control)); +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/dwio/StructColumnReader.h b/velox/experimental/wave/dwio/StructColumnReader.h index d047116ff0bed..15895ac8b3135 100644 --- a/velox/experimental/wave/dwio/StructColumnReader.h +++ b/velox/experimental/wave/dwio/StructColumnReader.h @@ -25,7 +25,7 @@ class StructColumnReader : public ColumnReader { StructColumnReader( const TypePtr& requestedType, std::shared_ptr fileType, - OperandId operand, + AbstractOperand* operand, FormatParams& params, velox::common::ScanSpec& scanSpec, bool isRoot) diff --git a/velox/experimental/wave/dwio/decode/DecodeStep.h b/velox/experimental/wave/dwio/decode/DecodeStep.h index cb6178fa8e486..9251446f838d1 100644 --- a/velox/experimental/wave/dwio/decode/DecodeStep.h +++ b/velox/experimental/wave/dwio/decode/DecodeStep.h @@ -54,6 +54,7 @@ enum class DecodeStep { kMap, kFlatMap, kFlatMapNode, + kRowCountNoFilter, kUnsupported, }; @@ -192,6 +193,11 @@ struct GpuDecode { int32_t* indicesCount; }; + struct RowCountNoFilter { + int32_t numRows; + BlockStatus* status; + }; + union { Trivial trivial; MainlyConstant mainlyConstant; @@ -201,6 +207,7 @@ struct GpuDecode { RleTotalLength rleTotalLength; Rle rle; MakeScatterIndices 
makeScatterIndices;
+    RowCountNoFilter rowCountNoFilter;
   } data;
 
   /// Returns the amount of shared memory for standard size thread block for
diff --git a/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh b/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh
index 62c23facac153..87f43b6f1e284 100644
--- a/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh
+++ b/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh
@@ -507,6 +507,24 @@ __device__ void makeScatterIndices(GpuDecode::MakeScatterIndices& op) {
     *op.indicesCount = indicesCount;
   }
 }
+
+template <int32_t kBlockSize>
+__device__ void setRowCountNoFilter(GpuDecode::RowCountNoFilter& op) {
+  auto numRows = op.numRows;
+  auto* status = op.status;
+  auto numCounts = roundUp(numRows, kBlockSize) / kBlockSize;
+  for (auto base = 0; base < numCounts; base += kBlockSize) {
+    auto idx = threadIdx.x + base;
+    if (idx < numCounts) {
+      // Every thread writes a row count and errors for kBlockSize rows. All
+      // errors are cleared and all row counts except the last are kBlockSize.
+      status[idx].numRows =
+          idx < numCounts - 1 ? kBlockSize : numRows - idx * kBlockSize;
+      memset(&status[idx].errors, 0, sizeof(status->errors));
+    }
+  }
+}
+
 template <int32_t kBlockSize>
 __device__ void decodeSwitch(GpuDecode& op) {
   switch (op.step) {
@@ -534,6 +552,9 @@ __device__ void decodeSwitch(GpuDecode& op) {
     case DecodeStep::kMakeScatterIndices:
       detail::makeScatterIndices<kBlockSize>(op.data.makeScatterIndices);
       break;
+    case DecodeStep::kRowCountNoFilter:
+      detail::setRowCountNoFilter<kBlockSize>(op.data.rowCountNoFilter);
+      break;
     default:
       if (threadIdx.x == 0) {
         printf("ERROR: Unsupported DecodeStep (with shared memory)\n");
@@ -554,6 +575,7 @@ int32_t sharedMemorySizeForDecode(DecodeStep step) {
     case DecodeStep::kTrivial:
     case DecodeStep::kDictionaryOnBitpack:
     case DecodeStep::kSparseBool:
+    case DecodeStep::kRowCountNoFilter:
       return 0;
       break;
diff --git a/velox/experimental/wave/exec/AggregateFunction.h b/velox/experimental/wave/exec/AggregateFunction.h
index 1d346a885f16a..e73e30be956f5 100644
--- a/velox/experimental/wave/exec/AggregateFunction.h
+++ b/velox/experimental/wave/exec/AggregateFunction.h
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include "velox/experimental/wave/exec/ErrorCode.h"
 #include "velox/experimental/wave/vector/Operand.h"
 
 namespace facebook::velox::wave::aggregation {
diff --git a/velox/experimental/wave/exec/Aggregation.cpp b/velox/experimental/wave/exec/Aggregation.cpp
index 27aae3b976e97..c26881505a12d 100644
--- a/velox/experimental/wave/exec/Aggregation.cpp
+++ b/velox/experimental/wave/exec/Aggregation.cpp
@@ -279,7 +279,7 @@ void Aggregation::flush(bool noMoreInput) {
   flushDone_.record(*flushStream_);
 }
 
-int32_t Aggregation::canAdvance() {
+int32_t Aggregation::canAdvance(WaveStream& stream) {
   if (!noMoreInput_ || finished_) {
     return 0;
   }
@@ -299,12 +299,21 @@ void Aggregation::schedule(WaveStream& waveStream, int32_t maxRows) {
       numColumns, exec->deviceData.emplace_back());
   auto* instructions = arena_->allocate(
       numColumns, exec->deviceData.emplace_back());
+  auto numBlocks = bits::roundUp(maxRows, kBlockSize) / kBlockSize;
+  auto* rowStatus =
+      arena_->allocate<BlockStatus>(numBlocks, exec->deviceData.emplace_back());
+  bzero(rowStatus, numBlocks * sizeof(BlockStatus));
+  for (auto i = 0; i < numBlocks; ++i) {
+    rowStatus[i].numRows =
+        i == numBlocks - 1 ?
maxRows - kBlockSize * i : kBlockSize; + } auto* status = arena_->allocate( numColumns, exec->deviceData.emplace_back()); bzero(status, numColumns * sizeof(BlockStatus)); exec->operands = arena_->allocate(numColumns, exec->deviceData.emplace_back()); exec->outputOperands = outputIds_; + exec->firstOutputOperandIdx = 0; for (int i = 0; i < numColumns; ++i) { auto column = WaveVector::create(outputType_->childAt(i), *arena_); column->resize(maxRows, false); @@ -333,6 +342,9 @@ void Aggregation::schedule(WaveStream& waveStream, int32_t maxRows) { int sharedSize = std::max( aggregation::ExtractKeys::sharedSize(), aggregation::ExtractValues::sharedSize()); + auto control = std::make_unique(id_, maxRows); + control->status = rowStatus; + waveStream.addLaunchControl(id_, std::move(control)); aggregation::call( *stream, numColumns, programs, nullptr, status, sharedSize); waveStream.markLaunch(*stream, *exes[0]); diff --git a/velox/experimental/wave/exec/Aggregation.h b/velox/experimental/wave/exec/Aggregation.h index bd208d7e745bc..41cd357532dc8 100644 --- a/velox/experimental/wave/exec/Aggregation.h +++ b/velox/experimental/wave/exec/Aggregation.h @@ -43,7 +43,7 @@ class Aggregation : public WaveOperator { void flush(bool noMoreInput) override; - int32_t canAdvance() override; + int32_t canAdvance(WaveStream& stream) override; void schedule(WaveStream& stream, int32_t maxRows) override; diff --git a/velox/experimental/wave/exec/AggregationInstructions.cu b/velox/experimental/wave/exec/AggregationInstructions.cu index ac2a341d8f8bb..1827dcb717e44 100644 --- a/velox/experimental/wave/exec/AggregationInstructions.cu +++ b/velox/experimental/wave/exec/AggregationInstructions.cu @@ -91,7 +91,7 @@ normalize(BlockInfo* block, void* idMap, Operand* key, int32_t& result) { auto* typedIdMap = reinterpret_cast*>(idMap); auto id = typedIdMap->makeId(value(key, block->base, block->shared)); if (id == -1) { - return ErrorCode::kInsuffcientMemory; + return ErrorCode::kInsufficientMemory; } assert(typedIdMap->cardinality() <= kNormalizationRadix); result = kNormalizationRadix * result + id - 1; diff --git a/velox/experimental/wave/exec/ErrorCode.h b/velox/experimental/wave/exec/ErrorCode.h index 3818c4d6fb925..d24d04713db62 100644 --- a/velox/experimental/wave/exec/ErrorCode.h +++ b/velox/experimental/wave/exec/ErrorCode.h @@ -18,23 +18,4 @@ #include "velox/experimental/wave/vector/Operand.h" -namespace facebook::velox::wave { - -/// -enum class ErrorCode : uint8_t { - // All operations completed. - kOk = 0, - - // Catchall for runtime errors. - kError, - - kInsuffcientMemory, -}; - -/// Contains a count of active lanes and a per lane error code. 
-struct BlockStatus {
-  int32_t numRows{0};
-  ErrorCode errors[kBlockSize];
-};
-
-} // namespace facebook::velox::wave
+namespace facebook::velox::wave {} // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/exec/ExprKernel.cu b/velox/experimental/wave/exec/ExprKernel.cu
index 825b652179173..9837b79c7c9a2 100644
--- a/velox/experimental/wave/exec/ExprKernel.cu
+++ b/velox/experimental/wave/exec/ExprKernel.cu
@@ -16,10 +16,13 @@
 
 #include "velox/experimental/wave/exec/ExprKernel.h"
 
+#include <gflags/gflags.h>
 #include "velox/experimental/wave/common/Block.cuh"
 #include "velox/experimental/wave/common/CudaUtil.cuh"
 #include "velox/experimental/wave/exec/WaveCore.cuh"
 
+DEFINE_bool(kernel_gdb, false, "Run kernels sequentially for debugging");
+
 namespace facebook::velox::wave {
 
 template <typename T>
 __device__ inline T opFunc_kPlus(T left, T right) {
@@ -30,7 +33,7 @@
 template <typename T, typename OpFunc>
 __device__ inline void binaryOpKernel(
     OpFunc func,
-    IBinary& op,
+    IBinary& instr,
     Operand** operands,
     int32_t blockBase,
     char* shared,
@@ -38,9 +41,15 @@
   if (threadIdx.x >= status->numRows) {
     return;
   }
-  flatResult(operands, op.result, blockBase, shared) = func(
-      getOperand(operands, op.left, blockBase, shared),
-      getOperand(operands, op.right, blockBase, shared));
+  T left;
+  T right;
+  if (operandOrNull(operands, instr.left, blockBase, shared, left) &&
+      operandOrNull(operands, instr.right, blockBase, shared, right)) {
+    flatResult<T>(
+        operands, instr.result, blockBase, shared) = func(left, right);
+  } else {
+    resultNull(operands, instr.result, blockBase, shared);
+  }
 }
 
 __device__ void filterKernel(
@@ -78,13 +87,45 @@
 }
 
 __device__ void wrapKernel(
-    IWrap& wrap,
+    const IWrap& wrap,
     Operand** operands,
     int32_t blockBase,
-    int32_t& numRows) {}
+    int32_t numRows) {
+  Operand* op = operands[wrap.indices];
+  auto* filterIndices = reinterpret_cast<int32_t*>(op->base);
+  if (filterIndices[blockBase + numRows - 1] == numRows + blockBase - 1) {
+    // There is no cardinality change.
+    return;
+  }
+  bool rowActive = threadIdx.x < numRows;
+  for (auto column = 0; column < wrap.numColumns; ++column) {
+    int32_t newIndex;
+    int32_t** opIndices;
+    bool remap = false;
+    if (rowActive) {
+      auto opIndex = wrap.columns[column];
+      auto* op = operands[opIndex];
+      opIndices = &op->indices[blockBase / kBlockSize];
+      remap = *opIndices != nullptr;
+      if (remap) {
+        newIndex =
+            (*opIndices)[filterIndices[blockBase + threadIdx.x] - blockBase];
+      } else if (threadIdx.x == 0) {
+        *opIndices = filterIndices + blockBase;
+      }
+    }
+    // All threads hit this.
+    __syncthreads();
+    if (remap) {
+      // remap can be true only on active rows.
+ (*opIndices)[threadIdx.x] = newIndex; + } + } + __syncthreads(); +} #define BINARY_TYPES(opCode, OP) \ - case OP_MIX(opCode, ScalarType::kInt64): \ + case OP_MIX(opCode, WaveTypeKind::BIGINT): \ binaryOpKernel( \ [](auto left, auto right) { return left OP right; }, \ instruction->_.binary, \ @@ -108,9 +149,11 @@ __global__ void waveBaseKernel( auto* operands = programOperands[programIndex]; auto* status = &blockStatusArray[blockIdx.x - baseIndices[blockIdx.x]]; int32_t blockBase = (blockIdx.x - baseIndices[blockIdx.x]) * blockDim.x; - for (auto i = 0; i < program->numInstructions; ++i) { - auto instruction = program->instructions[i]; + auto instruction = program->instructions; + for (;;) { switch (instruction->opCode) { + case OpCode::kReturn: + return; case OpCode::kFilter: filterKernel( instruction->_.filter, @@ -125,7 +168,20 @@ __global__ void waveBaseKernel( break; BINARY_TYPES(OpCode::kPlus, +); + BINARY_TYPES(OpCode::kLT, <); } + ++instruction; + } +} + +int32_t instructionSharedMemory(const Instruction& instruction) { + using ScanAlgorithm = cub::BlockScan; + + switch (instruction.opCode) { + case OpCode::kFilter: + return sizeof(ScanAlgorithm::TempStorage); + default: + return 0; } } @@ -144,6 +200,9 @@ void WaveKernelStream::call( sharedSize, alias ? alias->stream()->stream : stream()->stream>>>( bases, programIdx, programs, operands, status); + if (FLAGS_kernel_gdb) { + (alias ? alias : this)->wait(); + } } } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/ExprKernel.h b/velox/experimental/wave/exec/ExprKernel.h index ec4a2b2747669..40432ebea86fc 100644 --- a/velox/experimental/wave/exec/ExprKernel.h +++ b/velox/experimental/wave/exec/ExprKernel.h @@ -29,16 +29,6 @@ /// be allocated dynamically at kernel invocation. namespace facebook::velox::wave { -/// Mixed with opcode to switch between instantiations of instructions for -/// different types. -enum class ScalarType { - kInt32, - kInt64, - kReal, - kDouble, - kString, -}; - /// Opcodes for common instruction set. First all instructions that /// do not have operand type variants, then all the ones that /// do. For type templated instructions, the case label is opcode * @@ -47,6 +37,9 @@ enum class OpCode { // First all OpCodes that have no operand type specialization. kFilter = 0, kWrap, + kLiteral, + kNegate, + kReturn, // From here, only OpCodes that have variants for scalar types. kPlus, @@ -61,9 +54,12 @@ enum class OpCode { kNE, }; +constexpr int32_t kLastScalarKind = static_cast(WaveTypeKind::HUGEINT); -#define OP_MIX(op, t) \ - static_cast(static_cast(t) + 8 * static_cast(op)) +#define OP_MIX(op, t) \ + static_cast( \ + static_cast(t) + \ + (kLastScalarKind + 1) * static_cast(op)) struct IBinary { OperandIndex left; @@ -71,9 +67,6 @@ struct IBinary { OperandIndex result; // If set, apply operation to lanes where there is a non-zero byte in this. OperandIndex predicate{kEmpty}; - // If true, inverts the meaning of 'predicate', so that the operation is - // perfformed on lanes with a zero byte bit. Xored with predicate[idx]. - uint8_t invert{0}; }; struct IFilter { @@ -84,39 +77,33 @@ struct IFilter { struct IWrap { // The indices to wrap on top of 'columns'. OperandIndex indices; - - // Number of items in 'columns', 'targetColumns', 'nuwIndices', - // 'mayShareIndices'. int32_t numColumns; // The columns to wrap. OperandIndex* columns; - // The post wrap columns. 
If the original is not wrapped, these - // have the base of original and indices to wrap and posssibly new - // nulls from 'newNulls'. If the original is wrapped and - // newIndices[i] is non-nullptr, the combined indices from the - // existing wrap and 'indices are stored in - // 'newIndices'. 'newIndices[i]' is the indices of - // targetColumn[i]. If 'newIndices[i]' is nullptr, the new indices - // overwrite the indices in 'column[i]' and the indices are - // referenced from targetColunns[i]'. - OperandIndex* targetColumns; - - OperandIndex* newIndices; - - // If mayShareIndices[i]' is an index of a previous entry in 'columns' and - // columns[mayshareIndices[i]] shares indices of columns[i], then - // targetColumns[i] has indices of targetColumn[mayShareIndices[i]]. If the - // wrappings were not the same, indices are obtained from newIndices[i]. - int32_t* mayShareIndices; }; +struct ILiteral { + OperandIndex literal; + OperandIndex result; + OperandIndex predicate; +}; + +struct INegate { + OperandIndex value; + OperandIndex result; + OperandIndex predicate; +}; +struct IReturn {}; + struct Instruction { OpCode opCode; union { IBinary binary; IFilter filter; IWrap wrap; + ILiteral literal; + INegate negate; } _; }; @@ -125,10 +112,13 @@ struct ThreadBlockProgram { // across the ThreadBlockPrograms. int32_t sharedMemorySize{0}; int32_t numInstructions; - - Instruction** instructions; + // Array of instructions. Ends in a kReturn. + Instruction* instructions; }; +/// Returns the shared memory size for instruction for kBlockSize. +int32_t instructionSharedMemory(const Instruction& instruction); + /// A stream for invoking ExprKernel. class WaveKernelStream : public Stream { public: diff --git a/velox/experimental/wave/exec/Instruction.h b/velox/experimental/wave/exec/Instruction.h index 18616132133c6..cb6b5653507a9 100644 --- a/velox/experimental/wave/exec/Instruction.h +++ b/velox/experimental/wave/exec/Instruction.h @@ -24,7 +24,17 @@ namespace facebook::velox::wave { /// Abstract representation of Wave instructions. These translate to a device /// side ThreadBlockProgram right before execution. +template +T addBytes(U* p, int32_t bytes) { + return reinterpret_cast(reinterpret_cast(p) + bytes); +} + +/// Represents an input/output of an instruction or WaveOperator on host. The +/// device-side Operator is made at launch time based on this. struct AbstractOperand { + static constexpr int32_t kNoConstant = ~0; + static constexpr int32_t kNoWrap = ~0; + AbstractOperand(int32_t id, const TypePtr& type, std::string label) : id(id), type(type), label(label) {} @@ -39,42 +49,108 @@ struct AbstractOperand { // Label for debugging, e.g. column name or Expr::toString output. std::string label; + // The Operand of this is nullable if the Operand at some nullableIf_ is + // nullable. + std::vector nullableIf; + // Vector with constant value, else nullptr. VectorPtr constant; // True if bits in nulls or boolean values are as a bit field. Need widening // to byte on device. bool flagsAsBits{false}; + + // Offset of the literal from the block of literals after the instructions. + // The base array in Operand will be set to 'constantOffset + end of last + // instruction'. + int32_t literalOffset{kNoConstant}; + // true if null literal. + bool literalNull{false}; + + // True if the data needs no null flags. Applies to some intermediates like + // selected rows or flags or values of compile-time known non-nulls. 
+ bool notNull{false}; + + // True if nullability depends on the run-time nullability of Operands this + // depends on. These are in 'nullableIf'. + bool conditionalNonNull{false}; + + // if true, nullability is set in WaveStream at the time of launching. Given + // by e.g. file metadata but not set at plan time. + bool sourceNullable{false}; + + // Ordinal of the wrap instruction that first wraps this. All operands wrapped + // by the same wrap share 'Operand.indices'. All Operands that are wrapped at + // some point get indices when first created. When they get wrapped, there is + // one wrap for all Operands with the same 'wrappedAt' + int32_t wrappedAt{kNoWrap}; + + std::string toString() const; }; struct AbstractInstruction { AbstractInstruction(OpCode opCode) : opCode(opCode) {} + virtual ~AbstractInstruction() = default; + template T& as() { return *reinterpret_cast(this); } OpCode opCode; + + virtual std::string toString() const { + return fmt::format("OpCode {}", static_cast(opCode)); + } +}; + +struct AbstractReturn : public AbstractInstruction { + AbstractReturn() : AbstractInstruction(OpCode::kReturn) {} }; struct AbstractFilter : public AbstractInstruction { + AbstractFilter(AbstractOperand* flags, AbstractOperand* indices) + : AbstractInstruction(OpCode::kFilter), flags(flags), indices(indices) {} + AbstractOperand* flags; AbstractOperand* indices; + + std::string toString() const override; }; struct AbstractWrap : public AbstractInstruction { - AbstractOperand indices; + AbstractWrap(AbstractOperand* indices, int32_t id) + : AbstractInstruction(OpCode::kWrap), indices(indices), id(id) {} + AbstractOperand* indices; std::vector source; std::vector target; + const int32_t id; + // Offset of array of affected operand indices in the literals section of the + // TB program. Filled in by first pass of making the TB program. + int32_t literalOffset{-1}; + void addWrap(AbstractOperand* sourceOp, AbstractOperand* targetOp = nullptr) { - if (std::find(source.begin(), source.end(), sourceOp) != source.end()) { - return; + int newWrap = AbstractOperand::kNoWrap; + if (targetOp) { + targetOp->wrappedAt = id; + } else if (sourceOp->wrappedAt == AbstractOperand::kNoWrap) { + sourceOp->wrappedAt = id; + } + + for (auto i = 0; i < source.size(); ++i) { + // If the operand has the same wrap as another one here, do nothing. + if (source[i]->wrappedAt == sourceOp->wrappedAt || + (targetOp && target[i]->wrappedAt == targetOp->wrappedAt)) { + return; + } } source.push_back(sourceOp); target.push_back(targetOp ? 
targetOp : sourceOp); } + + std::string toString() const override; }; struct AbstractBinary : public AbstractInstruction { @@ -82,14 +158,49 @@ struct AbstractBinary : public AbstractInstruction { OpCode opCode, AbstractOperand* left, AbstractOperand* right, - AbstractOperand* result) - : AbstractInstruction(opCode), left(left), right(right), result(result) {} + AbstractOperand* result, + AbstractOperand* predicate = nullptr) + : AbstractInstruction(opCode), + left(left), + right(right), + result(result), + predicate(predicate) {} AbstractOperand* left; AbstractOperand* right; AbstractOperand* result; - AbstractOperand* predicate{nullptr}; - bool invert{false}; + AbstractOperand* predicate; + + std::string toString() const override; +}; + +struct AbstractLiteral : public AbstractInstruction { + AbstractLiteral( + const VectorPtr& constant, + AbstractOperand* result, + AbstractOperand* predicate) + : AbstractInstruction(OpCode::kLiteral), + constant(constant), + result(result), + predicate(predicate) {} + VectorPtr constant; + AbstractOperand* result; + AbstractOperand* predicate; +}; + +struct AbstractUnary : public AbstractInstruction { + AbstractUnary( + OpCode opcode, + AbstractOperand* input, + AbstractOperand* result, + AbstractOperand* predicate = nullptr) + : AbstractInstruction(opcode), + input(input), + result(result), + predicate(predicate) {} + AbstractOperand* input; + AbstractOperand* result; + AbstractOperand* predicate; }; } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/Project.cpp b/velox/experimental/wave/exec/Project.cpp index c9cbd8edfb4e8..ac0cb5aba90dc 100644 --- a/velox/experimental/wave/exec/Project.cpp +++ b/velox/experimental/wave/exec/Project.cpp @@ -21,6 +21,10 @@ namespace facebook::velox::wave { +AbstractWrap* Project::findWrap() const { + return filterWrap_; +} + void Project::schedule(WaveStream& stream, int32_t maxRows) { for (auto& level : levels_) { std::vector> exes(level.size()); @@ -35,7 +39,7 @@ void Project::schedule(WaveStream& stream, int32_t maxRows) { range, [&](Stream* out, folly::Range exes) { auto inputControl = driver_->inputControl(stream, id_); auto control = stream.prepareProgramLaunch( - id_, maxRows, exes, blocksPerExe, false, out); + id_, maxRows, exes, blocksPerExe, inputControl, out); reinterpret_cast(out)->call( out, exes.size() * blocksPerExe, @@ -53,8 +57,10 @@ void Project::finalize(CompileState& state) { for (auto& level : levels_) { for (auto& program : level) { program->prepareForDevice(state.arena()); - for (auto& pair : program->localAndOutput()) { - computedSet_.add(pair.first->id); + for (auto& pair : program->output()) { + if (true /*isProjected(id)*/) { + computedSet_.add(pair.first->id); + } } } } diff --git a/velox/experimental/wave/exec/Project.h b/velox/experimental/wave/exec/Project.h index 2a6137a831666..9e68d1ab5d0ff 100644 --- a/velox/experimental/wave/exec/Project.h +++ b/velox/experimental/wave/exec/Project.h @@ -24,8 +24,13 @@ class Project : public WaveOperator { CompileState& state, RowTypePtr outputType, std::vector operands, - std::vector> levels) - : WaveOperator(state, outputType, ""), levels_(std::move(levels)) {} + std::vector> levels, + AbstractWrap* filterWrap = nullptr) + : WaveOperator(state, outputType, ""), + levels_(std::move(levels)), + filterWrap_(filterWrap) {} + + AbstractWrap* findWrap() const override; bool isStreaming() const override { return true; @@ -38,7 +43,7 @@ class Project : public WaveOperator { void finalize(CompileState& state) override; std::string 
toString() const override {
-    return "Project";
+    return fmt::format("Project {}", WaveOperator::toString());
   }
 
   const OperandSet& syncSet() const override {
@@ -48,6 +53,7 @@ class Project : public WaveOperator {
  private:
   std::vector<std::vector<ProgramPtr>> levels_;
   OperandSet computedSet_;
+  AbstractWrap* filterWrap_{nullptr};
 };
 
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/exec/TableScan.cpp b/velox/experimental/wave/exec/TableScan.cpp
index 3ae6c04509ac5..0c1d78b3e3421 100644
--- a/velox/experimental/wave/exec/TableScan.cpp
+++ b/velox/experimental/wave/exec/TableScan.cpp
@@ -186,6 +186,7 @@ bool TableScan::isFinished() const {
 }
 
 void TableScan::addDynamicFilter(
+    const core::PlanNodeId& producer,
     column_index_t outputChannel,
    const std::shared_ptr<common::Filter>& filter) {
   if (dataSource_) {
diff --git a/velox/experimental/wave/exec/TableScan.h b/velox/experimental/wave/exec/TableScan.h
index 81b925ea8449a..b00d53e3d3a4f 100644
--- a/velox/experimental/wave/exec/TableScan.h
+++ b/velox/experimental/wave/exec/TableScan.h
@@ -25,13 +25,16 @@
 
 namespace facebook::velox::wave {
 
-class TableScan : public WaveOperator {
+class TableScan : public WaveSourceOperator {
  public:
   TableScan(
       CompileState& state,
       int32_t operatorId,
       const core::TableScanNode& tableScanNode)
-      : WaveOperator(state, tableScanNode.outputType(), tableScanNode.id()),
+      : WaveSourceOperator(
+            state,
+            tableScanNode.outputType(),
+            tableScanNode.id()),
         tableHandle_(tableScanNode.tableHandle()),
         columnHandles_(tableScanNode.assignments()),
         driverCtx_(state.driver().driverCtx()),
@@ -47,11 +50,11 @@
     connector_ = connector::getConnector(tableHandle_->connectorId());
   }
 
-  int32_t canAdvance() override {
+  int32_t canAdvance(WaveStream& stream) override {
     if (!dataSource_) {
       return 0;
     }
-    return waveDataSource_->canAdvance();
+    return waveDataSource_->canAdvance(stream);
   }
 
   void schedule(WaveStream& stream, int32_t maxRows = 0) override {
@@ -75,6 +78,7 @@
   }
 
   void addDynamicFilter(
+      const core::PlanNodeId& producer,
      column_index_t outputChannel,
      const std::shared_ptr<common::Filter>& filter) override;
 
diff --git a/velox/experimental/wave/exec/ToWave.cpp b/velox/experimental/wave/exec/ToWave.cpp
index cae8022830a4d..6b9f69bd8d46c 100644
--- a/velox/experimental/wave/exec/ToWave.cpp
+++ b/velox/experimental/wave/exec/ToWave.cpp
@@ -72,21 +72,22 @@ AbstractOperand* CompileState::newOperand(
     const TypePtr& type,
     const std::string& label) {
   operands_.push_back(
-      std::make_unique<AbstractOperand>(operandCounter_++, type, ""));
+      std::make_unique<AbstractOperand>(operandCounter_++, type, label));
   auto op = operands_.back().get();
   return op;
 }
 
-AbstractOperand* CompileState::addIdentityProjections(Value value) {
+AbstractOperand* CompileState::addIdentityProjections(AbstractOperand* source) {
   AbstractOperand* result = nullptr;
-  for (auto i = 0; i < operators_.size(); ++i) {
-    if (auto operand = operators_[i]->defines(value)) {
-      result = operand;
-      continue;
-    }
-    if (!result) {
-      continue;
-    }
+
+  int32_t latest = 0;
+  auto it = operandOperatorIndex_.find(source);
+  VELOX_CHECK(
+      it != operandOperatorIndex_.end(),
+      "The operand being projected through must be defined first");
+  latest = it->second;
+  result = source;
+  for (auto i = latest; i < operators_.size(); ++i) {
     if (auto wrap = operators_[i]->findWrap()) {
       if (operators_[i]->isExpanding()) {
         auto newResult = newOperand(*result);
@@ -102,16 +103,18 @@ AbstractOperand*
CompileState::findCurrentValue(Value value) { auto it = projectedTo_.find(value); + AbstractOperand* source; if (it == projectedTo_.end()) { auto originIt = definedBy_.find(value); if (originIt == definedBy_.end()) { return nullptr; } + source = originIt->second; // The operand is defined earlier, so must get translated through // cardinality changes. Or if it is not defined earlier, it is defined in // the WaveOperator being constructed, in which case,i.e. the operand in // 'definedBy_'. - auto projected = addIdentityProjections(value); + auto projected = addIdentityProjections(source); return projected ? projected : originIt->second; } return it->second; @@ -122,6 +125,9 @@ std::optional binaryOpCode(const Expr& expr) { if (name == "plus") { return OpCode::kPlus; } + if (name == "lt") { + return OpCode::kLT; + } return std::nullopt; } @@ -131,6 +137,17 @@ Program* CompileState::newProgram() { return program.get(); } +Program* CompileState::programOf(AbstractOperand* op, bool create) { + auto it = definedIn_.find(op); + if (it == definedIn_.end()) { + if (!create) { + return nullptr; + } + return newProgram(); + } + return it->second; +} + void CompileState::addInstruction( std::unique_ptr instruction, AbstractOperand* result, @@ -165,6 +182,32 @@ void CompileState::addInstruction( definedIn_[result] = program; } +bool maybeNotNull(const AbstractOperand* op) { + if (!op) { + return true; + } + return op->conditionalNonNull || op->notNull || op->sourceNullable; +} + +void CompileState::addNullableIf( + const AbstractOperand* op, + std::vector& nullableIf) { + for (auto id : op->nullableIf) { + if (std::find(nullableIf.begin(), nullableIf.end(), id) == + nullableIf.end()) { + nullableIf.push_back(id); + } + } +} + +void CompileState::setConditionalNullable(AbstractBinary& binary) { + if (maybeNotNull(binary.left) && maybeNotNull(binary.right)) { + binary.result->conditionalNonNull = true; + addNullableIf(binary.left, binary.result->nullableIf); + addNullableIf(binary.right, binary.result->nullableIf); + } +} + AbstractOperand* CompileState::addExpr(const Expr& expr) { auto value = toValue(expr); auto current = findCurrentValue(value); @@ -175,9 +218,23 @@ AbstractOperand* CompileState::addExpr(const Expr& expr) { if (auto* field = dynamic_cast(&expr)) { VELOX_FAIL("Should have been defined"); } else if (auto* constant = dynamic_cast(&expr)) { - VELOX_UNSUPPORTED("No constants"); + if (predicate_) { + auto result = newOperand(constant->type(), constant->toString()); + currentProgram_->add(std::make_unique( + constant->value(), result, predicate_)); + return result; + } else { + auto op = newOperand(constant->value()->type(), constant->toString()); + op->constant = constant->value(); + if (constant->value()->isNullAt(0)) { + op->literalNull = true; + } else { + op->notNull = true; + } + return op; + } } else if (dynamic_cast(&expr)) { - VELOX_UNSUPPORTED("No special forms"); + VELOX_UNSUPPORTED("No special forms: {}", expr.toString(1)); } auto opCode = binaryOpCode(expr); if (!opCode.has_value()) { @@ -188,6 +245,8 @@ AbstractOperand* CompileState::addExpr(const Expr& expr) { auto rightOp = addExpr(*expr.inputs()[1]); auto instruction = std::make_unique(opCode.value(), leftOp, rightOp, result); + setConditionalNullable(*instruction); + auto leftProgram = definedIn_[leftOp]; auto rightProgram = definedIn_[rightOp]; std::vector sources; @@ -209,6 +268,7 @@ std::vector CompileState::addExprSet( std::vector result; for (auto i = begin; i < end; ++i) { result.push_back(addExpr(*exprs[i])); + 
programOf(result.back())->addLabel(exprs[i]->toString(true)); } return result; } @@ -217,7 +277,7 @@ std::vector> CompileState::makeLevels( int32_t startIndex) { std::vector> levels; folly::F14FastSet toAdd; - for (auto i = 0; i < allPrograms_.size(); ++i) { + for (auto i = startIndex; i < allPrograms_.size(); ++i) { toAdd.insert(allPrograms_[i].get()); } while (!toAdd.empty()) { @@ -254,23 +314,61 @@ int32_t findOutputChannel( VELOX_FAIL("Expr without output channel"); } +void CompileState::addFilter(const Expr& expr, const RowTypePtr& outputType) { + int32_t numPrograms = allPrograms_.size(); + auto condition = addExpr(expr); + auto indices = newOperand(INTEGER(), "indices"); + indices->notNull = true; + auto program = programOf(condition); + program->addLabel(expr.toString(true)); + program->markOutput(indices->id); + program->add(std::make_unique(condition, indices)); + auto wrapUnique = std::make_unique(indices, wrapCounter_++); + auto wrap = wrapUnique.get(); + program->add(std::move(wrapUnique)); + auto levels = makeLevels(numPrograms); + operators_.push_back(std::make_unique( + *this, outputType, std::vector{}, levels, wrap)); +} + void CompileState::addFilterProject( exec::Operator* op, - RowTypePtr outputType, + RowTypePtr& outputType, int32_t& nodeIndex) { auto filterProject = reinterpret_cast(op); + outputType = driverFactory_.planNodes[nodeIndex]->outputType(); auto data = filterProject->exprsAndProjection(); - VELOX_CHECK(!data.hasFilter); + auto& identityProjections = filterProject->identityProjections(); + int32_t firstProjection = 0; + if (data.hasFilter) { + addFilter(*data.exprs->exprs()[0], outputType); + firstProjection = 1; + ++nodeIndex; + outputType = driverFactory_.planNodes[nodeIndex]->outputType(); + } int32_t numPrograms = allPrograms_.size(); - auto operands = addExprSet(*data.exprs, 0, data.exprs->exprs().size()); + auto operands = + addExprSet(*data.exprs, firstProjection, data.exprs->exprs().size()); + std::vector> pairs; for (auto i = 0; i < operands.size(); ++i) { - int32_t channel = findOutputChannel(*data.resultProjections, i); + int32_t channel = + findOutputChannel(*data.resultProjections, i + firstProjection); auto subfield = toSubfield(outputType->nameOf(channel)); - definedBy_[Value(subfield)] = operands[i]; + auto program = programOf(operands[i], false); + if (program) { + program->markOutput(operands[i]->id); + definedIn_[operands[i]] = program; + } + Value value(subfield); + definedBy_[value] = operands[i]; + pairs.push_back(std::make_pair(value, operands[i])); } auto levels = makeLevels(numPrograms); operators_.push_back( std::make_unique(*this, outputType, operands, levels)); + for (auto& [value, operand] : pairs) { + operators_.back()->defined(value, operand); + } } bool CompileState::reserveMemory() { @@ -314,8 +412,6 @@ bool CompileState::addOperator( if (!reserveMemory()) { return false; } - - outputType = driverFactory_.planNodes[nodeIndex]->outputType(); addFilterProject(op, outputType, nodeIndex); } else if (name == "Aggregation") { if (!reserveMemory()) { @@ -346,9 +442,11 @@ bool CompileState::addOperator( bool isProjectedThrough( const std::vector& projectedThrough, - int32_t i) { + int32_t i, + int32_t& inputChannel) { for (auto& projection : projectedThrough) { if (projection.outputChannel == i) { + inputChannel = projection.inputChannel; return true; } } @@ -366,20 +464,52 @@ bool CompileState::compile() { // Make sure operator states are initialized. We will need to inspect some of // them during the transformation. 
driver_.initializeOperators(); + RowTypePtr inputType; for (; operatorIndex < operators.size(); ++operatorIndex) { + int32_t previousNumOperators = operators_.size(); + auto& identity = operators[operatorIndex]->identityProjections(); + // The columns that are projected through are renamed. They may also get an + // indirection after the new operator is placed. + std::vector> identityProjected; + for (auto& projection : identity) { + identityProjected.push_back(std::make_pair( + findCurrentValue( + Value(toSubfield(inputType->nameOf(projection.inputChannel)))), + projection.outputChannel)); + } if (!addOperator(operators[operatorIndex], nodeIndex, outputType)) { break; } ++nodeIndex; - auto& identity = operators[operatorIndex]->identityProjections(); - for (auto i = 0; i < outputType->size(); ++i) { - Value value = Value(toSubfield(outputType->nameOf(i))); - if (isProjectedThrough(identity, i)) { - continue; + for (auto newIndex = previousNumOperators; newIndex < operators_.size(); + ++newIndex) { + for (auto i = 0; i < outputType->size(); ++i) { + auto& name = outputType->nameOf(i); + Value value = Value(toSubfield(name)); + int32_t inputChannel; + if (isProjectedThrough(identity, i, inputChannel)) { + continue; + } + auto operand = operators_[newIndex]->defines(value); + if (!operand && + (operators_[newIndex]->isSource() || + !operators_[newIndex]->isStreaming())) { + operand = operators_[newIndex]->definesSubfield( + *this, outputType->childAt(i), name, newIndex == 0); + } + if (operand) { + operators_[newIndex]->addOutputId(operand->id); + definedBy_[value] = operand; + operandOperatorIndex_[operand] = operators_.size() - 1; + } } - auto operand = operators_.back()->defines(value); - definedBy_[value] = operand; } + for (auto& [op, channel] : identityProjected) { + Value value(toSubfield(outputType->nameOf(channel))); + auto newOp = addIdentityProjections(op); + projectedTo_[value] = newOp; + } + inputType = outputType; } if (operators_.empty()) { return false; diff --git a/velox/experimental/wave/exec/ToWave.h b/velox/experimental/wave/exec/ToWave.h index 814943f85325b..d90ff1639c8f6 100644 --- a/velox/experimental/wave/exec/ToWave.h +++ b/velox/experimental/wave/exec/ToWave.h @@ -53,7 +53,7 @@ class CompileState { Value toValue(const exec::Expr& expr); - AbstractOperand* addIdentityProjections(Value value); + AbstractOperand* addIdentityProjections(AbstractOperand* source); AbstractOperand* findCurrentValue(Value value); AbstractOperand* addExpr(const exec::Expr& expr); @@ -82,9 +82,11 @@ class CompileState { bool addOperator(exec::Operator* op, int32_t& nodeIndex, RowTypePtr& outputType); + void addFilter(const exec::Expr& expr, const RowTypePtr& outputType); + void addFilterProject( exec::Operator* op, - RowTypePtr outputType, + RowTypePtr& outputType, int32_t& nodeIndex); bool reserveMemory(); @@ -101,17 +103,26 @@ class CompileState { const AbstractOperand* result, const std::vector& inputs); + void setConditionalNullable(AbstractBinary& binary); + + void addNullableIf( + const AbstractOperand* op, + std::vector& nullableIf); + + Program* programOf(AbstractOperand* op, bool create = true); + const std::shared_ptr& aggregateFunctionRegistry(); std::unique_ptr arena_; // The operator and output operand where the Value is first defined. - folly::F14FastMap - definedBy_; + DefinesMap definedBy_; // The Operand where Value is available after all projections placed to date. 
-  folly::F14FastMap
-      projectedTo_;
+  DefinesMap projectedTo_;
+
+  // Index of WaveOperator producing the operand.
+  folly::F14FastMap<AbstractOperand*, int32_t> operandOperatorIndex_;
 
   folly::F14FastMap<AbstractOperand*, Program*> definedIn_;
 
@@ -130,8 +141,12 @@
   // The program being generated.
   std::shared_ptr<Program> currentProgram_;
 
+  // Boolean to select the instruction. Set for conditional sections.
+  AbstractOperand* predicate_{nullptr};
+
   // Sequence number for operands.
   int32_t operandCounter_{0};
+  int32_t wrapCounter_{0};
 
   std::shared_ptr aggregateFunctionRegistry_;
 
diff --git a/velox/experimental/wave/exec/Values.cpp b/velox/experimental/wave/exec/Values.cpp
index e78f0a791a2f8..76cfd20d2ca2d 100644
--- a/velox/experimental/wave/exec/Values.cpp
+++ b/velox/experimental/wave/exec/Values.cpp
@@ -21,11 +21,11 @@
 namespace facebook::velox::wave {
 
 Values::Values(CompileState& state, const core::ValuesNode& values)
-    : WaveOperator(state, values.outputType(), values.id()),
+    : WaveSourceOperator(state, values.outputType(), values.id()),
       values_(values.values()),
       roundsLeft_(values.repeatTimes()) {}
 
-int32_t Values::canAdvance() {
+int32_t Values::canAdvance(WaveStream& stream) {
   if (current_ < values_.size()) {
     return values_[current_]->size();
   }
@@ -52,10 +52,16 @@ void Values::schedule(WaveStream& stream, int32_t maxRows) {
   for (auto i = 0; i < subfields_.size(); ++i) {
     sources.push_back(data->childAt(i).get());
   }
+  int32_t counter = 0;
+  outputIds_.forEach([&](auto id) {
+    stream.setNullable(*stream.operandAt(id), sources[counter]->mayHaveNulls());
+    ++counter;
+  });
   folly::Range empty(nullptr, nullptr);
   auto numBlocks = bits::roundUp(data->size(), kBlockSize) / kBlockSize;
+  stream.setNumRows(data->size());
   stream.prepareProgramLaunch(
-      id_, data->size(), empty, numBlocks, true, nullptr);
+      id_, data->size(), empty, numBlocks, nullptr, nullptr);
   vectorsToDevice(
       folly::Range(sources.data(), sources.size()), outputIds_, stream);
 }
diff --git a/velox/experimental/wave/exec/Values.h b/velox/experimental/wave/exec/Values.h
index b7c80229d89b6..880f46a0a0e26 100644
--- a/velox/experimental/wave/exec/Values.h
+++ b/velox/experimental/wave/exec/Values.h
@@ -20,11 +20,11 @@
 
 namespace facebook::velox::wave {
 
-class Values : public WaveOperator {
+class Values : public WaveSourceOperator {
  public:
   Values(CompileState& state, const core::ValuesNode& values);
 
-  int32_t canAdvance() override;
+  int32_t canAdvance(WaveStream& stream) override;
 
   bool isStreaming() const override {
     return true;
diff --git a/velox/experimental/wave/exec/Vectors.cpp b/velox/experimental/wave/exec/Vectors.cpp
index 1a279055d8110..f8947ec58edf7 100644
--- a/velox/experimental/wave/exec/Vectors.cpp
+++ b/velox/experimental/wave/exec/Vectors.cpp
@@ -119,18 +119,8 @@ void vectorsToDevice(
     transferVector(
         source[i], i, transfers, waveVectors, operandVector, arena, bytes);
   }
-  auto operands = arena.allocate(operandVector.size());
-  memcpy(
-      operands->as(),
-      operandVector.data(),
-      operandVector.size() * sizeof(Operand));
-  operandVector.clear();
   Executable::startTransfer(
-      ids,
-      std::move(operands),
-      std::move(waveVectors),
-      std::move(transfers),
-      stream);
+      ids, std::move(waveVectors), std::move(transfers), stream);
 }
 
 // Patches the position 'offset' in 'code' to be a new uninitialized device
diff --git a/velox/experimental/wave/exec/Wave.cpp b/velox/experimental/wave/exec/Wave.cpp
index bc2c6f3ae2807..54d51cb5095f4 100644
--- a/velox/experimental/wave/exec/Wave.cpp
+++ b/velox/experimental/wave/exec/Wave.cpp
@@ -19,6 +19,27 @@ namespace facebook::velox::wave
{ +std::string WaveTime::toString() const { + if (micros < 20) { + return fmt::format("{} ({} clocks)", succinctNanos(micros * 1000), clocks); + } + return succinctNanos(micros * 1000); +} + +void WaveStats::add(const WaveStats& other) { + numWaves += other.numWaves; + numKernels += other.numKernels; + numThreadBlocks += other.numThreadBlocks; + numPrograms += other.numPrograms; + numThreads += other.numThreads; + numSync += other.numSync; + bytesToDevice += other.bytesToDevice; + bytesToHost += other.bytesToHost; + hostOnlyTime += other.hostOnlyTime; + hostParallelTime += other.hostParallelTime; + waitTime += other.waitTime; +} + const SubfieldMap*& threadSubfieldMap() { thread_local const SubfieldMap* subfields; return subfields; @@ -35,25 +56,25 @@ std::string definesToString(const DefinesMap* map) { return out.str(); } -OperandId pathToOperand( +AbstractOperand* pathToOperand( const DefinesMap& map, std::vector>& path) { if (path.empty()) { - return kNoOperand; + return nullptr; } common::Subfield field(std::move(path)); const auto subfieldMap = threadSubfieldMap(); auto it = threadSubfieldMap()->find(field.toString()); if (it == subfieldMap->end()) { - return kNoOperand; + return nullptr; } Value value(it->second.get()); auto valueIt = map.find(value); path = std::move(field.path()); if (valueIt == map.end()) { - return kNoOperand; + return nullptr; } - return valueIt->second->id; + return valueIt->second; } WaveVector* Executable::operandVector(OperandId id) { @@ -107,6 +128,31 @@ WaveStream::~WaveStream() { } } +void WaveStream::setState(WaveStream::State state) { + if (state == state_) { + return; + } + WaveTime nowTime = WaveTime::now(); + switch (state_) { + case State::kNotRunning: + break; + case State::kHost: + stats_.hostOnlyTime += nowTime - start_; + break; + case State::kParallel: + stats_.hostParallelTime += nowTime - start_; + break; + case State::kWait: + stats_.waitTime += nowTime - start_; + break; + } + start_ = nowTime; + state_ = state; + if (state_ == State::kWait) { + ++stats_.numSync; + } +} + std::mutex WaveStream::reserveMutex_; std::vector> WaveStream::streamsForReuse_; std::vector> WaveStream::eventsForReuse_; @@ -175,6 +221,35 @@ void WaveStream::releaseEvent(std::unique_ptr&& event) { eventsForReuse_.push_back(std::move(event)); } +void WaveStream::markHostOutputOperand(const AbstractOperand& op) { + hostOutputOperands_.add(op.id); + auto nullable = isNullable(op); + auto alignment = WaveVector::alignment(op.type); + hostReturnSize_ = bits::roundUp(hostReturnSize_, alignment); + hostReturnSize_ += WaveVector::backingSize(op.type, numRows_, nullable); +} + +void WaveStream::setReturnData(bool needStatus) { + if (!needStatus && hostReturnSize_ == 0) { + return; + } +} + +void WaveStream::resultToHost() { + if (streams_.size() == 1) { + if (hostReturnDataUsed_ > 0) { + streams_[0]->deviceToHostAsync( + hostReturnData_->as(), + deviceReturnData_->as(), + hostReturnDataUsed_); + } + hostReturnEvent_ = newEvent(); + hostReturnEvent_->record(*streams_[0]); + } else { + VELOX_NYI(); + } +} + namespace { // Copies from pageable host to unified address. Multithreaded memcpy is // probably best. 
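Aside: WaveStream::setState above implements a simple exclusive-state time accounting: on every transition, the wall time elapsed since the previous transition is charged to the state being left, and each entry into kWait counts as one synchronization. A self-contained sketch of the same pattern follows; it is an illustration only, not part of the diff, and the type names and std::chrono clock are stand-ins for the Wave-specific WaveTime machinery.

    #include <chrono>
    #include <cstdint>

    enum class State { kNotRunning, kHost, kParallel, kWait };

    struct Stats {
      int64_t hostOnlyNanos{0};
      int64_t hostParallelNanos{0};
      int64_t waitNanos{0};
      int64_t numSync{0};
    };

    class StateClock {
     public:
      void setState(State state) {
        if (state == state_) {
          return;
        }
        auto now = std::chrono::steady_clock::now();
        auto elapsed =
            std::chrono::duration_cast<std::chrono::nanoseconds>(now - start_)
                .count();
        // Charge the elapsed interval to the state being left.
        switch (state_) {
          case State::kNotRunning:
            break; // Nothing to attribute before the first transition.
          case State::kHost:
            stats_.hostOnlyNanos += elapsed;
            break;
          case State::kParallel:
            stats_.hostParallelNanos += elapsed;
            break;
          case State::kWait:
            stats_.waitNanos += elapsed;
            break;
        }
        start_ = now;
        state_ = state;
        if (state_ == State::kWait) {
          ++stats_.numSync; // Each entry into kWait counts as one sync.
        }
      }

     private:
      State state_{State::kNotRunning};
      Stats stats_;
      std::chrono::steady_clock::time_point start_{
          std::chrono::steady_clock::now()};
    };

Because the states are mutually exclusive, the three buckets plus untracked kNotRunning time always sum to the stream's wall time, which is what makes the hostOnlyTime/hostParallelTime/waitTime stats directly comparable.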
@@ -188,17 +263,20 @@ void copyData(std::vector& transfers) { void Executable::startTransfer( OperandSet outputOperands, - WaveBufferPtr&& operands, std::vector&& outputVectors, std::vector&& transfers, WaveStream& waveStream) { auto exe = std::make_unique(); + auto numBlocks = bits::roundUp(waveStream.numRows(), kBlockSize) / kBlockSize; + exe->waveStream = &waveStream; exe->outputOperands = outputOperands; + WaveStream::ExeLaunchInfo info; + waveStream.exeLaunchInfo(*exe, numBlocks, info); exe->output = std::move(outputVectors); exe->transfers = std::move(transfers); - exe->deviceData.push_back(operands); - exe->operands = operands->as(); - exe->outputOperands = outputOperands; + exe->deviceData.push_back(waveStream.arena().allocate(info.totalBytes)); + auto start = exe->deviceData[0]->as(); + exe->operands = waveStream.fillOperands(*exe, start, info)[0]; copyData(exe->transfers); auto* device = waveStream.device(); waveStream.installExecutables( @@ -206,6 +284,7 @@ void Executable::startTransfer( [&](Stream* stream, folly::Range executables) { for (auto& transfer : executables[0]->transfers) { stream->prefetch(device, transfer.to, transfer.size); + waveStream.stats().bytesToDevice += transfer.size; } waveStream.markLaunch(*stream, *executables[0]); }); @@ -220,9 +299,11 @@ void WaveStream::installExecutables( OperandSetHasher, OperandSetComparer> dependences; + VELOX_CHECK_NULL(hostReturnEvent_); for (auto& exeUnique : executables) { executables_.push_back(std::move(exeUnique)); auto exe = executables_.back().get(); + exe->waveStream = this; VELOX_CHECK(exe->stream == nullptr); OperandSet streamSet; exe->inputOperands.forEach([&](int32_t id) { @@ -243,13 +324,17 @@ void WaveStream::installExecutables( } // exes with no dependences go on a new stream. Streams with dependent compute - // get an event. The dependent computes ggo on new streams that first wait for + // get an event. The dependent computes go on new streams that first wait for // the events. folly::F14FastMap streamEvents; for (auto& [ids, exeVector] : dependences) { folly::Range exes(exeVector.data(), exeVector.size()); std::vector required; ids.forEach([&](int32_t id) { required.push_back(streams_[id].get()); }); + if (required.size() == 1) { + launch(required[0], exes); + continue; + } if (required.empty()) { auto stream = newStream(); launch(stream, exes); @@ -275,9 +360,12 @@ bool WaveStream::isArrived( int32_t sleepMicro, int32_t timeoutMicro) { OperandSet waitSet; + if (hostReturnEvent_) { + return hostReturnEvent_->query(); + } ids.forEach([&](int32_t id) { auto exe = operandToExecutable_[id]; - VELOX_CHECK_NOT_NULL(exe); + VELOX_CHECK_NOT_NULL(exe, "No exe produces operand {} in stream", id); if (!exe->stream) { return; } @@ -315,9 +403,161 @@ bool WaveStream::isArrived( return false; } -template -T addBytes(U* p, int32_t bytes) { - return reinterpret_cast(reinterpret_cast(p) + bytes); +void WaveStream::ensureVector( + const AbstractOperand& op, + WaveVectorPtr& vector, + int32_t numRows) { + if (!vector) { + vector = std::make_unique(op.type, arena()); + } + bool nullable = isNullable(op); + if (false /*hostOutputOperands_.contains(op.id)*/) { + VELOX_NYI(); + } else { + vector->resize(numRows < 0 ? 
numRows_ : numRows, nullable); + } +} + +bool WaveStream::isNullable(const AbstractOperand& op) const { + bool notNull = op.notNull; + if (!notNull) { + if (op.sourceNullable) { + notNull = !operandNullable_[op.id]; + } else { + notNull = true; + for (auto i : op.nullableIf) { + if (operandNullable_[i]) { + notNull = false; + break; + } + } + } + } + return !notNull; +} + +void WaveStream::exeLaunchInfo( + Executable& exe, + int32_t numBlocks, + ExeLaunchInfo& info) { + // The exe has an Operand* for each input/local/output/literal + // op. It has an Operand for each local/output/literal op. It has + // an array of numBlock int32_t*'s for every distinct wrapAt in + // its local/output operands where the wrapAt does not occur in + // any of the input Operands. + info.numBlocks = numBlocks; + info.numInput = exe.inputOperands.size(); + exe.inputOperands.forEach([&](auto id) { + auto op = operandAt(id); + auto* inputExe = operandExecutable(op->id); + if (op->wrappedAt != AbstractOperand::kNoWrap) { + auto* indices = inputExe->wraps[op->wrappedAt]; + VELOX_CHECK_NOT_NULL(indices); + info.inputWrap[op->wrappedAt] = indices; + } + }); + + exe.localOperands.forEach([&](auto id) { + auto op = operandAt(id); + if (op->wrappedAt != AbstractOperand::kNoWrap) { + if (info.inputWrap.find(id) == info.inputWrap.end()) { + if (info.localWrap.find(op->wrappedAt) == info.localWrap.end()) { + info.localWrap[op->wrappedAt] = reinterpret_cast( + info.localWrap.size() * numBlocks * sizeof(void*)); + } + } + } + }); + exe.outputOperands.forEach([&](auto id) { + auto op = operandAt(id); + if (op->wrappedAt != AbstractOperand::kNoWrap) { + if (info.inputWrap.find(id) == info.inputWrap.end()) { + if (info.localWrap.find(op->wrappedAt) == info.localWrap.end()) { + info.localWrap[op->wrappedAt] = reinterpret_cast( + info.localWrap.size() * numBlocks * sizeof(void*)); + } + } + } + }); + auto numLiteral = exe.literals ? exe.literals->size() : 0; + info.numLocalOps = + exe.localOperands.size() + exe.outputOperands.size() + numLiteral; + info.totalBytes = + // Pointer to Operand for input and local Operands. + sizeof(void*) * (info.numLocalOps + exe.inputOperands.size()) + + // Flat array of Operand for all but input. + sizeof(Operand) * info.numLocalOps + + // Space for the 'indices' for each distinct wrappedAt. 
+ (info.localWrap.size() * numBlocks * sizeof(void*)); +} + +Operand** +WaveStream::fillOperands(Executable& exe, char* start, ExeLaunchInfo& info) { + Operand** operandPtrBegin = addBytes(start, 0); + exe.inputOperands.forEach([&](int32_t id) { + auto* inputExe = operandToExecutable_[id]; + int32_t ordinal = inputExe->outputOperands.ordinal(id); + *operandPtrBegin = + &inputExe->operands[inputExe->firstOutputOperandIdx + ordinal]; + ++operandPtrBegin; + }); + Operand* operandBegin = addBytes( + start, (info.numInput + info.numLocalOps) * sizeof(void*)); + int32_t* indicesBegin = + addBytes(operandBegin, info.numLocalOps * sizeof(Operand)); + for (auto& [id, ptr] : info.localWrap) { + info.localWrap[id] = + addBytes(indicesBegin, reinterpret_cast(ptr)); + } + exe.wraps = std::move(info.localWrap); + for (auto& [id, ptr] : info.inputWrap) { + exe.wraps[id] = ptr; + } + exe.intermediates.resize(exe.localOperands.size()); + int32_t fill = 0; + exe.localOperands.forEach([&](auto id) { + auto op = operandAt(id); + ensureVector(*op, exe.intermediates[fill]); + auto vec = exe.intermediates[fill].get(); + ++fill; + vec->toOperand(operandBegin); + if (op->wrappedAt != AbstractOperand::kNoWrap) { + operandBegin->indices = exe.wraps[op->wrappedAt]; + VELOX_CHECK_NOT_NULL(operandBegin->indices); + } + *operandPtrBegin = operandBegin; + ++operandPtrBegin; + ++operandBegin; + }); + exe.firstOutputOperandIdx = exe.intermediates.size(); + exe.output.resize(exe.outputOperands.size()); + fill = 0; + exe.outputOperands.forEach([&](auto id) { + auto op = operandAt(id); + ensureVector(*op, exe.output[fill]); + auto vec = exe.output[fill].get(); + ++fill; + vec->toOperand(operandBegin); + if (op->wrappedAt != AbstractOperand::kNoWrap) { + operandBegin->indices = exe.wraps[op->wrappedAt]; + VELOX_CHECK_NOT_NULL(operandBegin->indices); + } + *operandPtrBegin = operandBegin; + ++operandPtrBegin; + ++operandBegin; + }); + + auto numConstants = exe.literals ? exe.literals->size() : 0; + if (numConstants) { + memcpy(operandBegin, exe.literals->data(), numConstants * sizeof(Operand)); + for (auto i = 0; i < numConstants; ++i) { + *operandPtrBegin = operandBegin; + ++operandPtrBegin; + ++operandBegin; + } + } + + return addBytes(start, 0); } LaunchControl* WaveStream::prepareProgramLaunch( @@ -325,42 +565,41 @@ LaunchControl* WaveStream::prepareProgramLaunch( int32_t inputRows, folly::Range exes, int32_t blocksPerExe, - bool initStatus, + const LaunchControl* inputControl, Stream* stream) { static_assert(Operand::kPointersInOperand * sizeof(void*) == sizeof(Operand)); - int32_t shared = 0; // First calculate total size. // 2 int arrays: blockBase, programIdx. - int32_t numBlocks = std::min(1, exes.size()) * blocksPerExe; + int32_t numBlocks = std::max(1, exes.size()) * blocksPerExe; int32_t size = 2 * numBlocks * sizeof(int32_t); + std::vector info(exes.size()); auto exeOffset = size; // 2 pointers per exe: TB program and start of its param array. size += exes.size() * sizeof(void*) * 2; auto operandOffset = size; - // Exe dependent sizes for parameters. - int32_t numTotalOps = 0; - for (auto& exe : exes) { - markLaunch(*stream, *exe); - shared = std::max(shared, exe->programShared->sharedMemorySize()); - int32_t numIn = exe->inputOperands.size(); - int numOps = numIn + exe->intermediates.size() + exe->outputOperands.size(); - numTotalOps += numOps; - size += numOps * sizeof(void*) + (numOps - numIn) * sizeof(Operand); + // Exe dependent sizes for operands. 
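fillOperands() above carves one contiguous allocation into three regions: an Operand* table (inputs first, then the exe's own operands and literals), the flat Operand structs for everything the exe defines, and one int32_t* slot per thread block for each wrap the exe introduces. A sketch of the same size computation, with LaunchSizes as a made-up aggregate for illustration:

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative offset computation for one executable's operand block,
// following exeLaunchInfo()/fillOperands() above. Field names are assumed
// for the sketch; only the arithmetic mirrors the patch.
struct LaunchSizes {
  int32_t numInput;    // Operand* slots that alias producer executables
  int32_t numLocalOps; // local + output + literal operands
  int32_t numWraps;    // distinct wrappedAt ids not inherited from inputs
  int32_t numBlocks;   // thread blocks in the launch
};

int32_t totalBytes(const LaunchSizes& s, int32_t operandSize) {
  // [Operand* x (numInput + numLocalOps)] [Operand x numLocalOps]
  // [int32_t* x numWraps * numBlocks]
  return sizeof(void*) * (s.numInput + s.numLocalOps) +
      operandSize * s.numLocalOps +
      sizeof(void*) * s.numWraps * s.numBlocks;
}

int main() {
  // 2 inputs, 3 locals/outputs, 1 wrap, 4 thread blocks, 64-byte Operand:
  // 5*8 + 3*64 + 4*8 = 264 bytes.
  LaunchSizes s{2, 3, 1, 4};
  printf("block size = %d bytes\n", totalBytes(s, 64));
}
```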
+ int32_t operandBytes = 0; + int32_t shared = 0; + for (auto i = 0; i < exes.size(); ++i) { + exeLaunchInfo(*exes[i], numBlocks, info[i]); + operandBytes += info[i].totalBytes; + markLaunch(*stream, *exes[i]); + shared = std::max(shared, exes[i]->programShared->sharedMemorySize()); } + size += operandBytes; int32_t statusOffset = 0; - if (initStatus) { + if (!inputControl) { statusOffset = size; // Pointer to return block for each tB. size += blocksPerExe * sizeof(BlockStatus); } auto buffer = arena_.allocate(size); + memset(buffer->as(), 0, size); - auto controlUnique = std::make_unique(); + auto controlUnique = std::make_unique(key, inputRows); auto& control = *controlUnique; - control.key = key; - control.inputRows = inputRows; control.sharedMemorySize = shared; // Now we fill in the various arrays and put their start addresses in // 'control'. @@ -371,11 +610,7 @@ LaunchControl* WaveStream::prepareProgramLaunch( control.programIdx, numBlocks * sizeof(int32_t)); control.operands = addBytes(control.programs, exes.size() * sizeof(void*)); - int32_t fill = 0; - Operand** operandPtrBegin = addBytes(start, operandOffset); - Operand* operandArrayBegin = - addBytes(operandPtrBegin, numTotalOps * sizeof(void*)); - if (initStatus) { + if (!inputControl) { // If the launch produces new statuses (as opposed to updating status of a // previous launch), there is an array with a status for each TB. If there // are multiple exes, they all share the same error codes. A launch can have @@ -383,93 +618,136 @@ LaunchControl* WaveStream::prepareProgramLaunch( // Writing errors is not serialized but each lane with at least one error // will show one error. control.status = addBytes(start, statusOffset); - memset(control.status, 0, blocksPerExe * sizeof(BlockStatus)); + // Memory is already set to all 0. for (auto i = 0; i < blocksPerExe; ++i) { auto status = &control.status[i]; status->numRows = i == blocksPerExe - 1 ? inputRows % kBlockSize : kBlockSize; } } else { - control.status = nullptr; - } - for (auto exeIdx = 0; exeIdx < exes.size(); ++exeIdx) { - auto exe = exes[exeIdx]; - int32_t numIn = exe->inputOperands.size(); - int32_t numLocal = exe->intermediates.size() + exe->outputOperands.size(); - control.programs[exeIdx] = exe->program; - control.operands[exeIdx] = operandPtrBegin; - // We get the actual input operands for the exe from the exes this depends - // on - exe->inputOperands.forEach([&](int32_t id) { - auto* inputExe = operandToExecutable_[id]; - int32_t ordinal = inputExe->outputOperands.ordinal(id); - *operandPtrBegin = &inputExe->operands[ordinal]; - ++operandPtrBegin; - }); - // We install the intermediates and outputs from the WaveVectors in the exe. - exe->operands = operandArrayBegin; - for (auto& vec : exe->intermediates) { - *operandPtrBegin = operandArrayBegin; - vec->toOperand(operandArrayBegin); - ++operandPtrBegin; - ++operandArrayBegin; - } - for (auto& vec : exe->output) { - *operandPtrBegin = operandArrayBegin; - vec->toOperand(operandArrayBegin); - ++operandPtrBegin; - ++operandArrayBegin; - } + control.status = inputControl->status; + } + char* operandStart = addBytes(start, operandOffset); + int32_t fill = 0; + for (auto i = 0; i < exes.size(); ++i) { + control.programs[i] = exes[i]->program; + + auto operandPtrs = fillOperands(*exes[i], operandStart, info[i]); + control.operands[i] = operandPtrs; + // The operands defined by the exe start after the input operands and are + // all consecutive. 
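The blockBase and programIdx arrays filled in the loop above let every thread block find its program and its block-relative position with two loads. A host-side sketch of the decode a kernel would do (names are assumptions of the sketch):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct Decoded {
  int32_t program;    // index into LaunchControl::programs
  int32_t blockInExe; // thread block ordinal within its executable
};

// blockBase[blockIdx.x] holds the first global block of the same program,
// so the block-relative ordinal is blockIdx.x - blockBase[blockIdx.x].
Decoded decode(
    const std::vector<int32_t>& blockBase,
    const std::vector<int32_t>& programIdx,
    int32_t blockIdxX) {
  return {programIdx[blockIdxX], blockIdxX - blockBase[blockIdxX]};
}

int main() {
  // Two executables, 4 blocks each: blocks 0..3 run program 0, 4..7 program 1.
  std::vector<int32_t> blockBase{0, 0, 0, 0, 4, 4, 4, 4};
  std::vector<int32_t> programIdx{0, 0, 0, 0, 1, 1, 1, 1};
  Decoded d = decode(blockBase, programIdx, 5);
  printf("program %d, block-in-exe %d\n", d.program, d.blockInExe); // 1, 1
}
```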
+ exes[i]->operands = operandPtrs[exes[i]->inputOperands.size()]; + operandStart += info[i].totalBytes; for (auto tbIdx = 0; tbIdx < blocksPerExe; ++tbIdx) { - control.blockBase[fill] = exeIdx * blocksPerExe; - control.programIdx[fill] = exeIdx; + control.blockBase[fill] = i * blocksPerExe; + control.programIdx[fill] = i; + ++fill; } } + if (!exes.empty()) { + ++stats_.numKernels; + } + stats_.numPrograms += exes.size(); + stats_.numThreadBlocks += blocksPerExe * exes.size(); + stats_.numThreads += numRows_ * exes.size(); + control.deviceData = std::move(buffer); launchControl_[key].push_back(std::move(controlUnique)); return &control; } -void WaveStream::getOutput( +int32_t WaveStream::getOutput( + int32_t operatorId, + memory::MemoryPool& pool, folly::Range operands, - WaveVectorPtr* waveVectors) { + VectorPtr* vectors) { + auto it = launchControl_.find(operatorId); + VELOX_CHECK(it != launchControl_.end()); + auto* control = it->second[0].get(); + auto* status = control->status; + auto numBlocks = bits::roundUp(control->inputRows, kBlockSize) / kBlockSize; + if (operands.empty()) { + return statusNumRows(status, numBlocks); + } for (auto i = 0; i < operands.size(); ++i) { auto id = operands[i]; auto exe = operandExecutable(id); VELOX_CHECK_NOT_NULL(exe); auto ordinal = exe->outputOperands.ordinal(id); - waveVectors[i] = std::move(exe->output[ordinal]); - if (waveVectors[i] == nullptr) { + auto waveVectorPtr = &exe->output[ordinal]; + if (!waveVectorPtr->get()) { exe->ensureLazyArrived(operands); - waveVectors[i] = std::move(exe->output[ordinal]); - VELOX_CHECK_NOT_NULL(waveVectors[i]); + VELOX_CHECK_NOT_NULL( + waveVectorPtr->get(), "Lazy load should have filled in the result"); } + vectors[i] = waveVectorPtr->get()->toVelox( + &pool, + numBlocks, + status, + &exe->operands[exe->firstOutputOperandIdx + ordinal]); } + return vectors[0]->size(); } -ScalarType typeKindCode(TypeKind kind) { - switch (kind) { - case TypeKind::BIGINT: - return ScalarType::kInt64; - default: - VELOX_UNSUPPORTED("Bad TypeKind {}", kind); - } +WaveTypeKind typeKindCode(TypeKind kind) { + return static_cast(kind); } +#define IN_HEAD(abstract, physical, _op) \ + auto* abstractInst = &instruction->as(); \ + space->opCode = _op; \ + auto physicalInst = new (&space->_) physical(); + +#define IN_OPERAND(member) \ + physicalInst->member = operandIndex(abstractInst->member) + void Program::prepareForDevice(GpuArena& arena) { - int32_t codeSize = 0; - int32_t sharedMemorySize = 0; + VELOX_CHECK(!instructions_.empty()); + if (instructions_.back()->opCode != OpCode::kReturn) { + instructions_.push_back(std::make_unique()); + } + int32_t codeSize = sizeof(Instruction) * instructions_.size(); for (auto& instruction : instructions_) switch (instruction->opCode) { - case OpCode::kPlus: { + case OpCode::kFilter: { + auto& filter = instruction->as(); + markInput(filter.flags); + markResult(filter.indices); + + break; + } + case OpCode::kWrap: { + auto& wrap = instruction->as(); + markInput(wrap.indices); + std::vector indices(wrap.target.size()); + wrap.literalOffset = addLiteral(indices.data(), indices.size()); + for (auto i = 0; i < wrap.target.size(); ++i) { + auto target = wrap.target[i]; + markInput(wrap.source[i]); + if (target != wrap.source[i]) { + markResult(target); + } + } + break; + } + case OpCode::kPlus: + case OpCode::kLT: { auto& bin = instruction->as(); markInput(bin.left); markInput(bin.right); markResult(bin.result); markInput(bin.predicate); - codeSize += sizeof(Instruction); break; } + case OpCode::kNegate: 
{ + auto& un = instruction->as(); + markInput(un.input); + markResult(un.result); + markInput(un.predicate); + break; + } + case OpCode::kReturn: + break; default: VELOX_UNSUPPORTED( "OpCode {}", static_cast(instruction->opCode)); @@ -477,58 +755,94 @@ void Program::prepareForDevice(GpuArena& arena) { sortSlots(); arena_ = &arena; deviceData_ = arena.allocate( - codeSize + instructions_.size() * sizeof(void*) + - sizeof(ThreadBlockProgram)); + codeSize + literalArea_.size() + sizeof(ThreadBlockProgram)); + uintptr_t end = reinterpret_cast( + deviceData_->as() + deviceData_->size()); program_ = deviceData_->as(); - auto instructionArray = addBytes(program_, sizeof(*program_)); - program_->sharedMemorySize = sharedMemorySize; + auto instructionArray = addBytes(program_, sizeof(*program_)); program_->numInstructions = instructions_.size(); program_->instructions = instructionArray; - Instruction* space = addBytes( - instructionArray, instructions_.size() * sizeof(void*)); + Instruction* space = instructionArray; + deviceLiterals_ = reinterpret_cast(space) + + sizeof(Instruction) * instructions_.size(); + VELOX_CHECK_LE( + reinterpret_cast(deviceLiterals_) + literalArea_.size(), end); + memcpy(deviceLiterals_, literalArea_.data(), literalArea_.size()); + for (auto& instruction : instructions_) { - *instructionArray = space; - ++instructionArray; switch (instruction->opCode) { - case OpCode::kPlus: { - auto& bin = instruction->as(); - auto typeCode = typeKindCode(bin.left->type->kind()); - // Comstructed on host, no vtable. - space->opCode = OP_MIX(instruction->opCode, typeCode); - new (&space->_.binary) IBinary(); - space->_.binary.left = operandIndex(bin.left); - space->_.binary.right = operandIndex(bin.right); - space->_.binary.result = operandIndex(bin.result); - ++space; + case OpCode::kPlus: + case OpCode::kLT: { + IN_HEAD( + AbstractBinary, + IBinary, + OP_MIX( + instruction->opCode, + instruction->as().left->type->kind())); + + IN_OPERAND(left); + IN_OPERAND(right); + IN_OPERAND(result); + IN_OPERAND(predicate); + break; + } + case OpCode::kFilter: { + IN_HEAD(AbstractFilter, IFilter, OpCode::kFilter); + IN_OPERAND(flags); + IN_OPERAND(indices); + break; + } + case OpCode::kWrap: { + IN_HEAD(AbstractWrap, IWrap, OpCode::kWrap); + IN_OPERAND(indices); + physicalInst->numColumns = abstractInst->source.size(); + physicalInst->columns = reinterpret_cast( + deviceLiterals_ + abstractInst->literalOffset); + for (auto i = 0; i < abstractInst->source.size(); ++i) { + physicalInst->columns[i] = operandIndex(abstractInst->source[i]); + } + break; + } + case OpCode::kReturn: { + IN_HEAD(AbstractReturn, IReturn, OpCode::kReturn); break; } default: VELOX_UNSUPPORTED("Bad OpCode"); } + sharedMemorySize_ = + std::max(sharedMemorySize_, instructionSharedMemory(*space)); + ++space; + VELOX_CHECK_LE( + reinterpret_cast(space), + reinterpret_cast(deviceLiterals_)); + } + program_->sharedMemorySize = sharedMemorySize_; + literalOperands_.resize(literal_.size()); + for (auto& [op, index] : literal_) { + literalToOperand(op, literalOperands_[index - firstLiteralIdx_]); } } -void Program::sortSlots() { - // Assigns offsets to input and local/output slots so that all - // input is first and output next and within input and output, the - // slots are ordered with lower operand id first. So, if inputs - // are slots 88 and 22 and outputs are 77 and 33, then the - // complete order is 22, 88, 33, 77. 
- std::vector ids; - for (auto& pair : input_) { - ids.push_back(pair.first); - } - std::sort( - ids.begin(), - ids.end(), - [](AbstractOperand*& left, AbstractOperand*& right) { - return left->id < right->id; - }); - for (auto i = 0; i < ids.size(); ++i) { - input_[ids[i]] = i; +void Program::literalToOperand(AbstractOperand* abstractOp, Operand& op) { + op.indexMask = 0; + op.indices = nullptr; + if (abstractOp->literalNull) { + op.nulls = + reinterpret_cast(deviceLiterals_ + abstractOp->literalOffset); + } else { + op.base = deviceLiterals_ + abstractOp->literalOffset; } - ids.clear(); - for (auto& pair : local_) { +} + +namespace { +// Sorts 'map' by id. Inserts back into map with second as ordinal number +// starting at 'startAt'. Returns 1 + the highest assigned number. +int32_t sortAndRenumber( + int32_t startAt, + folly::F14FastMap& map) { + std::vector ids; + for (auto& pair : map) { ids.push_back(pair.first); } std::sort( @@ -538,32 +852,114 @@ void Program::sortSlots() { return left->id < right->id; }); for (auto i = 0; i < ids.size(); ++i) { - local_[ids[i]] = i + input_.size(); + map[ids[i]] = i + startAt; } + return startAt + ids.size(); +} +} // namespace + +void Program::sortSlots() { + // Assigns offsets to input and local/output slots so that all + // input is first and output next and within input and output, the + // slots are ordered with lower operand id first. So, if inputs + // are slots 88 and 22 and outputs are 77 and 33, then the + // complete order is 22, 88, 33, 77. Constants are sorted after everything + // else. + + auto start = sortAndRenumber(0, input_); + start = sortAndRenumber(start, local_); + start = sortAndRenumber(start, output_); + firstLiteralIdx_ = start; + sortAndRenumber(start, literal_); } OperandIndex Program::operandIndex(AbstractOperand* op) const { + if (!op) { + return kEmpty; + } auto it = input_.find(op); if (it != input_.end()) { return it->second; } it = local_.find(op); - if (it == local_.end()) { - VELOX_FAIL("Bad operand, offset not known"); + if (it != local_.end()) { + return it->second; + } + it = output_.find(op); + if (it != local_.end()) { + return it->second; + } + + it = literal_.find(op); + if (it != literal_.end()) { + return it->second; + } + VELOX_FAIL("Operand not found"); +} + +template +int32_t Program::addLiteral(T* value, int32_t count) { + nextLiteral_ = bits::roundUp(nextLiteral_, sizeof(T)); + auto start = nextLiteral_; + nextLiteral_ += sizeof(T) * count; + literalArea_.resize(nextLiteral_); + memcpy(literalArea_.data() + start, value, sizeof(T) * count); + return start; +} + +template +int32_t Program::addLiteralTyped(AbstractOperand* op) { + if (op->literalOffset != AbstractOperand::kNoConstant) { + return op->literalOffset; + } + using T = typename TypeTraits::NativeType; + if (op->constant->isNullAt(0)) { + op->literalNull = true; + char zero = 0; + return op->literalOffset = addLiteral(&zero, 1); + } + T value = op->constant->as>()->valueAt(0); + if constexpr (std::is_same_v) { + int64_t inlined = 0; + StringView* stringView = reinterpret_cast(&value); + if (stringView->size() <= 6) { + int64_t inlined = static_cast(stringView->size()) << 48; + memcpy( + reinterpret_cast(&inlined) + 2, + stringView->data(), + stringView->size()); + op->literalOffset = addLiteral(&inlined, 1); + } else { + int64_t zero = 0; + op->literalOffset = addLiteral(&zero, 1); + addLiteral(stringView->data(), stringView->size()); + } + } else { + op->literalOffset = addLiteral(&value, 1); } - return it->second; + return 
op->literalOffset; } void Program::markInput(AbstractOperand* op) { if (!op) { return; } - if (!local_.count(op)) { + if (op->constant) { + VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + addLiteralTyped, op->constant->type()->kind(), op); + literal_[op] = literal_.size(); + return; + } + if (!local_.count(op) && !output_.count(op)) { input_[op] = input_.size(); } } void Program::markResult(AbstractOperand* op) { + if (outputIds_.contains(op->id)) { + output_[op] = outputIds_.ordinal(op->id); + return; + } if (!local_.count(op)) { local_[op] = local_.size(); } @@ -588,22 +984,81 @@ std::unique_ptr Program::getExecutable( exe->inputOperands.add(pair.first->id); } for (auto& pair : local_) { + exe->localOperands.add(pair.first->id); + } + for (auto& pair : output_) { exe->outputOperands.add(pair.first->id); } - exe->output.resize(local_.size()); + + exe->literals = &literalOperands_; exe->releaser = [](std::unique_ptr& ptr) { auto program = ptr->programShared.get(); ptr->reuse(); program->releaseExe(std::move(ptr)); }; + } + return exe; +} + +std::string AbstractOperand::toString() const { + if (constant) { + return fmt::format( + "", constant->toString(0), type->toString()); + } + return fmt::format("<{}: {} {}>", id, label, type->toString()); +} - } // We have an exe, whether new or reused. Check the vectors. - int32_t nth = 0; - exe->outputOperands.forEach([&](int32_t id) { - ensureWaveVector( - exe->output[nth], operands[id]->type, maxRows, true, *arena_); - ++nth; +std::string Executable::toString() const { + std::stringstream out; + out << "{Exe produces "; + bool first = true; + outputOperands.forEach([&](auto id) { + if (!first) { + out << ", "; + }; + first = false; + out << waveStream->operandAt(id)->toString(); }); - return exe; + if (programShared) { + out << std::endl; + out << "program " << programShared->label(); + } + return out.str(); } + +std::string Program::toString() const { + std::stringstream out; + out << "{ program" << std::endl; + for (auto& instruction : instructions_) { + out << instruction->toString() << std::endl; + } + out << "}" << std::endl; + return out.str(); +} + +std::string AbstractFilter::toString() const { + return fmt::format("filter {} -> {}", flags->toString(), indices->toString()); + ; +} + +std::string AbstractWrap::toString() const { + std::stringstream out; + out << "wrap indices=" << indices->toString() << " {"; + for (auto& op : source) { + out << op->toString() << " "; + } + out << "}"; + return out.str(); +} + +std::string AbstractBinary::toString() const { + return fmt::format( + "{} = {} {} {} {}", + result->toString(), + left->toString(), + static_cast(opCode), + right->toString(), + predicate ? fmt::format(" if {}", predicate->toString()) : ""); +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/Wave.h b/velox/experimental/wave/exec/Wave.h index 64754caafb14e..d9e6506e27660 100644 --- a/velox/experimental/wave/exec/Wave.h +++ b/velox/experimental/wave/exec/Wave.h @@ -28,6 +28,79 @@ namespace facebook::velox::wave { +/// A host side time point for measuring wait and launch prepare latency. Counts +/// both wall microseconds and clocks. 
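The string case of addLiteralTyped() above inlines short literals into a single int64: the length is shifted into bits 48..63 and the payload is copied starting at byte 2 of the word. A self-contained illustration of that packing (function and variable names are local to the sketch):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Packs a string of at most 6 bytes the way addLiteralTyped() inlines a
// short StringView literal: size in bits 48..63, payload at byte 2 onward.
int64_t inlineShortString(const char* data, int32_t size) {
  int64_t inlined = static_cast<int64_t>(size) << 48;
  std::memcpy(reinterpret_cast<char*>(&inlined) + 2, data, size);
  return inlined;
}

int main() {
  int64_t packed = inlineShortString("abc", 3);
  printf("size = %d\n", static_cast<int>(packed >> 48)); // 3
  char out[4] = {};
  std::memcpy(out, reinterpret_cast<char*>(&packed) + 2, 3);
  printf("payload = %s\n", out); // abc
}
```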
+struct WaveTime {
+  size_t micros{0};
+  uint64_t clocks{0};
+
+  static WaveTime now() {
+    return {getCurrentTimeMicro(), folly::hardware_timestamp()};
+  }
+
+  WaveTime operator-(const WaveTime right) const {
+    return {micros - right.micros, clocks - right.clocks};
+  }
+
+  WaveTime operator+(const WaveTime right) const {
+    return {right.micros + micros, right.clocks + clocks};
+  }
+  void operator+=(const WaveTime& other) {
+    micros += other.micros;
+    clocks += other.clocks;
+  }
+  std::string toString() const;
+};
+
+class WaveTimer {
+ public:
+  WaveTimer(WaveTime& accumulator)
+      : accumulator_(accumulator), start_(WaveTime::now()) {}
+  ~WaveTimer() {
+    accumulator_ = accumulator_ + (WaveTime::now() - start_);
+  }
+
+ private:
+  WaveTime& accumulator_;
+  WaveTime start_;
+};
+
+struct WaveStats {
+  /// Count of WaveStreams.
+  int64_t numWaves{1};
+
+  // Count of kernel launches.
+  int64_t numKernels{0};
+
+  // Count of thread blocks in all kernel launches.
+  int64_t numThreadBlocks{0};
+
+  /// Number of programs. One launch typically has several programs, roughly
+  /// one per output column.
+  int64_t numPrograms{0};
+
+  /// Number of starting lanes in kernel launches. This is not exactly thread
+  /// blocks times block size because the last block per program is not full.
+  int64_t numThreads{0};
+
+  /// Data transfer from host to device.
+  int64_t bytesToDevice{0};
+
+  int64_t bytesToHost{0};
+
+  /// Number of times the host syncs with device.
+  int64_t numSync{0};
+
+  /// Time a host thread runs without activity on device, e.g. after a sync or
+  /// before first launch.
+  WaveTime hostOnlyTime;
+  /// Time a host thread runs after kernel launch preparing the next kernel.
+  WaveTime hostParallelTime;
+  /// Time a host thread waits for device.
+  WaveTime waitTime;
+
+  void add(const WaveStats& other);
+};
+
 // A value a kernel can depend on. Either a dedupped exec::Expr or a dedupped
 // subfield. Subfield between operators, Expr inside an Expr.
 struct Value {
@@ -38,13 +111,8 @@ struct Value {
   ~Value() = default;
 
   bool operator==(const Value& other) const {
-    if (expr == other.expr && subfield == other.subfield) {
-      return true;
-    };
-    if (subfield && other.subfield && *subfield == *other.subfield) {
-      return true;
-    }
-    return false;
+    // Both exprs and subfields are deduplicated.
+    return expr == other.expr && subfield == other.subfield;
   }
 
   const exec::Expr* expr;
@@ -53,6 +121,7 @@ struct Value {
 
 struct ValueHasher {
   size_t operator()(const Value& value) const {
+    // Hash the addresses because both exprs and subfields are deduplicated.
    return folly::hasher<uint64_t>()(
               reinterpret_cast<uint64_t>(value.subfield)) ^
        folly::hasher<uint64_t>()(reinterpret_cast<uint64_t>(value.expr));
@@ -74,7 +143,7 @@ using DefinesMap =
 /// Translates a set of path steps to an AbstractOperand or nullptr if
 /// none found. The path is not const because it is temporarily
 /// moved into a Subfield. Not thread safe for 'path'.
-OperandId pathToOperand(
+AbstractOperand* pathToOperand(
     const DefinesMap& map,
     std::vector<std::unique_ptr<common::Subfield::PathElement>>& path);
 
@@ -124,7 +193,6 @@ struct Executable {
   /// addTransfer().
   static void startTransfer(
       OperandSet outputOperands,
-      WaveBufferPtr&& operands,
      std::vector<WaveVectorPtr>&& outputVectors,
      std::vector<Transfer>&& transfers,
       WaveStream& stream);
@@ -146,8 +214,12 @@ struct Executable {
   void reuse() {
     operands = nullptr;
     stream = nullptr;
+    wraps.clear();
   }
-  // The containing WaveStream, if needed.
+
+  virtual std::string toString() const;
+
+  // The containing WaveStream.
   WaveStream* waveStream{nullptr};
 
   // The Program this is an invocation of.
nullptr if 'this' represents a data @@ -168,18 +240,29 @@ struct Executable { // Operand ids for outputs. OperandSet outputOperands; - // Unified memory Operand structs for intermediates/outputs. These + // Unified memory Operand structs for intermediates/outputs/literals. These // are a contiguous array of Operand in LaunchControl of 'this' Operand* operands; + // Index of first output operand in 'operands'. + int32_t firstOutputOperandIdx{-1}; + + // Map from wrapAt in AbstractOperand to device side 'indices' with one + // int32_t* per thread block. + folly::F14FastMap wraps; + + // Host side array of literals. These refer to literal data in device side + // ThreadBlockProgram. These are copied at the end of 'operands' at launch. + const std::vector* literals; + // Backing memory for intermediate Operands. Free when 'this' arrives. If // scheduling follow up work that is synchronized with arrival of 'this', the // intermediates can be moved to the dependent executable at time of // scheduling. std::vector intermediates; - // Backing device memory for 'output' Can be moved to intermediates or - // output of a dependent executables. + // Backing device memory for 'output'. These are accessed by dependent + // executables and must not be written to until out of scope. std::vector output; // If this represents data transfer, the ranges to transfer. @@ -203,6 +286,11 @@ class Program : public std::enable_shared_from_this { instructions_.push_back(std::move(instruction)); } + /// Specifies that Operand with 'id' is used by a dependent operation. + void markOutput(OperandId id) { + outputIds_.add(id); + } + const std::vector& dependsOn() const { return dependsOn_; } @@ -215,8 +303,8 @@ class Program : public std::enable_shared_from_this { dependsOn_.push_back(source); } - // Initializes executableImage and relocation information and places for - // parameters. + // Initializes executableImage and relocation information and places + // the result on device. void prepareForDevice(GpuArena& arena); std::unique_ptr getExecutable( @@ -247,11 +335,31 @@ class Program : public std::enable_shared_from_this { return sharedMemorySize_; } - const folly::F14FastMap& localAndOutput() const { - return local_; + const folly::F14FastMap& output() const { + return output_; + } + + const std::string& label() const { + return label_; + } + + void addLabel(const std::string& label) { + label_ = label_ + " " + label; } + std::string toString() const; + private: + template + int32_t addLiteralTyped(AbstractOperand* op); + /// Returns a starting offset to a constant with 'count' elements of T, + /// initialized from 'value[]' The values are copied to device side + /// ThreadBlockProgram. + template + int32_t addLiteral(T* value, int32_t count); + + void literalToOperand(AbstractOperand* abstractOp, Operand& op); + GpuArena* arena_{nullptr}; std::vector dependsOn_; DefinesMap produces_; @@ -260,7 +368,8 @@ class Program : public std::enable_shared_from_this { // Adds 'op' to 'input' if it is not produced by one in 'local' void markInput(AbstractOperand* op); - // Adds 'op' to 'local_' + + // Adds 'op' to 'local_' or 'output_'. void markResult(AbstractOperand* op); void sortSlots(); @@ -269,8 +378,28 @@ class Program : public std::enable_shared_from_this { // Input Operand to offset in operands array. folly::F14FastMap input_; - // Local/output Operand offset in operands array. + /// Set of OperandIds for outputs. These must come after intermediates in + /// Operands array. 
+  OperandSet outputIds_;
+
+  // Local Operand offset in operands array.
   folly::F14FastMap<AbstractOperand*, int32_t> local_;
+  // Output Operand offset in operands array.
+  folly::F14FastMap<AbstractOperand*, int32_t> output_;
+
+  // OperandIdx for first literal operand.
+  int32_t firstLiteralIdx_{-1};
+
+  // Constant Operand to offset in operands array.
+  folly::F14FastMap<AbstractOperand*, int32_t> literal_;
+
+  // Offset of first unused constant area byte from start of constant area.
+  int32_t nextLiteral_{0};
+
+  // Binary data for constants to be embedded in ThreadBlockProgram. Must be
+  // relocatable, i.e. does not contain non-relative pointers within the
+  // constant area.
+  std::string literalArea_;
 
   // Owns device side 'threadBlockProgram_'
   WaveBufferPtr deviceData_;
@@ -279,6 +408,15 @@ class Program : public std::enable_shared_from_this<Program> {
   ThreadBlockProgram* program_;
 
   int32_t sharedMemorySize_{0};
+
+  // Host side image of device side Operands that reference 'constantArea_'.
+  // These are copied at the end of the operand block created at kernel launch.
+  std::vector<Operand> literalOperands_;
+
+  std::string label_;
+
+  // Start of device side constant area.
+  char* deviceLiterals_{nullptr};
 
   // Serializes 'prepared_'. Access on WaveStream is single threaded but
   // sharing Programs across WaveDrivers makes sense, so make the preallocated
   // resource thread safe.
@@ -295,7 +433,25 @@ struct LaunchControl;
 /// Represents consecutive data dependent kernel launches.
 class WaveStream {
  public:
-  WaveStream(GpuArena& arena) : arena_(arena) {}
+  /// Describes what 'this' is doing for purposes of stats collection.
+  enum class State {
+    // Not runnable, e.g. another WaveStream is being processed by WaveDriver.
+    kNotRunning,
+    // Running on host only, e.g. preparing for first kernel launch.
+    kHost,
+    // Running on host with device side work submitted.
+    kParallel,
+    // Waiting on host thread for device results.
+    kWait
+  };
+
+  WaveStream(
+      GpuArena& arena,
+      GpuArena& hostArena,
+      const std::vector<std::unique_ptr<AbstractOperand>>* operands)
+      : arena_(arena), hostArena_(hostArena), operands_(operands) {
+    operandNullable_.resize(operands_->size(), true);
+  }
 
   ~WaveStream();
 
@@ -310,9 +466,48 @@ class WaveStream {
     return arena_;
   }
 
-  void getOutput(
+  void setNullable(const AbstractOperand& op, bool nullable) {
+    operandNullable_[op.id] = nullable;
+  }
+
+  int32_t numRows() const {
+    return numRows_;
+  }
+
+  // Sets the size of top-level vectors to be prepared for the next launch.
+  void setNumRows(int32_t numRows) {
+    numRows_ = numRows;
+  }
+
+  /// Sets 'vector' to a WaveVector of suitable type, size and
+  /// nullability. May reuse 'vector' if not nullptr. The size comes
+  /// from setNumRows() if not given as parameter.
+  void ensureVector(
+      const AbstractOperand& operand,
+      WaveVectorPtr& vector,
+      int32_t numRows = -1);
+
+  /// Marks 'op' as being later copied to host. Allocates these together.
+  void markHostOutputOperand(const AbstractOperand& op);
+
+  /// Finalizes return state. setNumRows and markHostOutputOperand may not be
+  /// called after this. If 'needStatus' is false and no columns are marked for
+  /// host return there is no need for any data transfer at the end of the
+  /// stream.
+  void setReturnData(bool needStatus);
+
+  /// Enqueues copy of device side results to host.
+  void resultToHost();
+
+  /// Updates 'vectors' to reference the data in 'operands'. 'operatorId' is
+  /// the id of the last WaveOperator. It identifies the LaunchControl with the
+  /// final BlockStatus with errors and cardinalities. Returns the number of
+  /// rows after possible selection.
+  int32_t getOutput(
+      int32_t operatorId,
+      memory::MemoryPool& pool,
      folly::Range<const OperandId*> operands,
-      WaveVectorPtr* waveVectors);
+      VectorPtr* vectors);
 
   Executable* operandExecutable(OperandId id) {
     auto it = operandToExecutable_.find(id);
@@ -385,7 +580,7 @@ class WaveStream {
       int32_t inputRows,
      folly::Range<Executable**> exes,
       int32_t blocksPerExe,
-      bool initstatus,
+      const LaunchControl* inputStatus,
       Stream* stream);
 
  const std::vector<std::unique_ptr<LaunchControl>>& launchControls(
@@ -393,7 +588,46 @@ class WaveStream {
     return launchControl_[key];
   }
 
+  void addLaunchControl(int32_t key, std::unique_ptr<LaunchControl> control) {
+    launchControl_[key].push_back(std::move(control));
+  }
+
+  const AbstractOperand* operandAt(int32_t id) {
+    VELOX_CHECK_LT(id, operands_->size());
+    return (*operands_)[id].get();
+  }
+
+  // Describes an exe in a multi-program launch.
+  struct ExeLaunchInfo {
+    int32_t numBlocks;
+    int32_t numInput{0};
+    int32_t numLocalOps{0};
+    int32_t numLocalWrap{0};
+    int32_t totalBytes{0};
+    folly::F14FastMap<int32_t, int32_t**> inputWrap;
+    folly::F14FastMap<int32_t, int32_t**> localWrap;
+  };
+
+  void
+  exeLaunchInfo(Executable& exe, int32_t blocksPerExe, ExeLaunchInfo& info);
+
+  Operand** fillOperands(Executable& exe, char* start, ExeLaunchInfo& info);
+
+  /// Sets the state for stats collection.
+  void setState(WaveStream::State state);
+
+  const WaveStats& stats() const {
+    return stats_;
+  }
+
+  WaveStats& stats() {
+    return stats_;
+  }
+
  private:
+  // True if 'op' is nullable in the context of 'this'.
+  bool isNullable(const AbstractOperand& op) const;
+
   Event* newEvent();
 
  static std::unique_ptr<Event> eventFromReserve();
@@ -408,15 +642,28 @@ class WaveStream {
   static void clearReusable();
 
   GpuArena& arena_;
+  GpuArena& hostArena_;
+
+  const std::vector<std::unique_ptr<AbstractOperand>>* const operands_;
+
+  // True at '[i]' if in this stream 'operands_[i]' should have null flags.
+  std::vector<bool> operandNullable_;
+
+  // Number of rows to allocate for top level vectors for the next kernel
+  // launch.
+  int32_t numRows_{0};
+
  folly::F14FastMap<OperandId, Executable*> operandToExecutable_;
  std::vector<std::unique_ptr<Executable>> executables_;
 
   // Currently active streams, each at the position given by its
   // stream->userData().
  std::vector<std::unique_ptr<Stream>> streams_;
+
   // The most recent event recorded on the pairwise corresponding element of
   // 'streams_'.
  std::vector<Event*> lastEvent_;
 
+  // If status return copy has been initiated, then this is the event to sync
+  // with before accessing 'hostReturnData_'.
+  Event* hostReturnEvent_{nullptr};
 
   // All events recorded on any stream. Events, once seen realized, are moved
   // back to reserve from here.
@@ -428,6 +675,34 @@ class WaveStream {
       launchControl_;
 
   folly::F14FastMap extraData_;
+
+  // Ids of operands that need their memory to be in the host return area.
+  OperandSet hostOutputOperands_;
+
+  // Offset of the operand in 'hostReturnData_' and 'deviceReturnData_'.
+  folly::F14FastMap hostReturnOffset_;
+
+  // Size of data returned at end of stream.
+  int64_t hostReturnSize_{0};
+
+  int64_t hostReturnDataUsed_{0};
+
+  // Device side data for all returnable data, like BlockStatus and Vector
+  // bodies to be copied to host.
+  WaveBufferPtr deviceReturnData_;
+
+  // Host pinned memory to which 'deviceReturnData' is copied.
+  WaveBufferPtr hostReturnData_;
+
+  // Pointer to statuses inside 'hostReturnData_'.
+  BlockStatus* hostStatus_{nullptr};
+
+  // Time when host side activity last started on 'this'.
+  WaveTime start_;
+
+  State state_{State::kNotRunning};
+
+  WaveStats stats_;
 };
 
 /// Describes all the control data for launching a kernel executing
@@ -443,25 +718,33 @@ class WaveStream {
 //// WaveVectors in each exe. Array of TB return status blocks, one
 //// per TB.
 struct LaunchControl {
-  int32_t key;
+  LaunchControl(int32_t _key, int32_t _inputRows)
+      : key(_key), inputRows(_inputRows) {}
+
+  // Id of the initiating operator.
+  const int32_t key;
 
-  int32_t inputRows;
+  // Number of rows the programs get as input. Initializes the BlockStatus'es
+  // on device in prepareProgramLaunch().
+  const int32_t inputRows;
 
-  /// The first thread block with the program.
-  int32_t* blockBase;
+  /// The first thread block with the program. Subscript is blockIdx.x.
+  int32_t* blockBase{nullptr};
 
   // The ordinal of the program. All blocks with the same program have the same
-  // number here.
-  int32_t* programIdx;
+  // number here. Subscript is blockIdx.x.
+  int32_t* programIdx{nullptr};
 
-  // The TB program for each exe.
-  ThreadBlockProgram** programs;
+  // The TB program for each exe. The subscript is programIdx[blockIdx.x].
+  ThreadBlockProgram** programs{nullptr};
 
   // For each exe, the start of the array of Operand*. Instructions reference
-  // operands via offset in this array.//
-  Operand*** operands;
+  // operands via offset in this array. The subscript is
+  // programIdx[blockIdx.x].
+  Operand*** operands{nullptr};
 
-  // the status return block for each TB.
-  BlockStatus* status;
+  // The status return block for each TB. The subscript is blockIdx.x -
+  // (blockBase[blockIdx.x] / kBlockSize). Shared between all programs.
+  BlockStatus* status{nullptr};
 
   int32_t sharedMemorySize{0};
 
   // Storage for all the above in a contiguous unified memory piece.
diff --git a/velox/experimental/wave/exec/WaveCore.cuh b/velox/experimental/wave/exec/WaveCore.cuh
index 058b89bb18627..d000cf5081315 100644
--- a/velox/experimental/wave/exec/WaveCore.cuh
+++ b/velox/experimental/wave/exec/WaveCore.cuh
@@ -27,8 +27,45 @@ __device__ inline T& flatValue(void* base, int32_t blockBase) {
   return reinterpret_cast<T*>(base)[blockBase + threadIdx.x];
 }
 
-__device__ inline bool isNull(Operand* op, int32_t blockBase) {
-  return op->nulls == nullptr || !op->nulls[blockBase + threadIdx.x];
+template <typename T>
+__device__ T& sharedMemoryOperand(char* shared, OperandIndex op) {
+  return reinterpret_cast<T*>(
+      shared + ((op & kSharedOperandMask) << 1))[blockIdx.x];
+}
+
+/// Returns true if operand is non null. Sets 'value' to the value of the
+/// operand.
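A shared-memory OperandIndex, as decoded by sharedMemoryOperand() above, packs three fields into 16 bits: bit 15 marks a shared-memory operand, kSharedOperandMask holds a 13-bit offset that is shifted left by one, and kSharedNullMask selects one of the per-block null regions. A worked decode using the constants from Operand.h later in this patch (the example index value is hypothetical):

```cpp
#include <cstdint>
#include <cstdio>

using OperandIndex = uint16_t;
constexpr OperandIndex kMinSharedMemIndex = 0x8000;
constexpr uint16_t kSharedNullMask = 3;         // selects a null region
constexpr uint16_t kSharedOperandMask = 0x7ffc; // 13-bit offset, shift by 1

int main() {
  // Bit 15 set, value offset bits 0x0124, null region 2; this mirrors the
  // decoding done by sharedMemoryOperand()/operandOrNull().
  OperandIndex opIdx = kMinSharedMemIndex | 0x0124 | 2;
  if (opIdx > kMinSharedMemIndex) {
    int nullRegion = opIdx & kSharedNullMask;           // 2: second null area
    int byteOffset = (opIdx & kSharedOperandMask) << 1; // 0x248 = 584
    printf("null region %d, value offset %d\n", nullRegion, byteOffset);
  }
}
```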
+template +__device__ inline bool operandOrNull( + Operand** operands, + OperandIndex opIdx, + int32_t blockBase, + char* shared, + T& value) { + if (opIdx > kMinSharedMemIndex) { + uint16_t mask = opIdx & kSharedNullMask; + if (mask > 0 && shared[kBlockSize * (mask - 1) + blockIdx.x] == kNull) { + return false; + } + value = sharedMemoryOperand(shared, opIdx); + return true; + } + auto op = operands[opIdx]; + int32_t index = threadIdx.x; + if (auto indicesInOp = op->indices) { + auto indices = indicesInOp[blockBase / kBlockSize]; + if (indices) { + index = indices[index]; + } else { + index += blockBase; + } + } else { + index = (index + blockBase) & op->indexMask; + } + if (op->nulls && op->nulls[index] == kNull) { + return false; + } + value = reinterpret_cast(op->base)[index]; + return true; } template @@ -38,8 +75,7 @@ __device__ inline T getOperand( int32_t blockBase, char* shared) { if (opIdx > kMinSharedMemIndex) { - return reinterpret_cast( - shared + opIdx - kMinSharedMemIndex)[blockIdx.x]; + return sharedMemoryOperand(shared, opIdx); } auto op = operands[opIdx]; int32_t index = (threadIdx.x + blockBase) & op->indexMask; @@ -68,6 +104,21 @@ __device__ inline T value(Operand* op, int index) { return reinterpret_cast(op->base)[index]; } +/// Sets the lane's result to null for opIdx. +__device__ inline void resultNull( + Operand** operands, + OperandIndex opIdx, + int32_t blockBase, + char* shared) { + if (opIdx >= kMinSharedMemIndex) { + auto offset = (opIdx & kSharedNullMask) - 1; + shared[(kBlockSize * offset) + blockIdx.x] = kNull; + } else { + auto* op = operands[opIdx]; + op->nulls[blockBase + threadIdx.x] = kNull; + } +} + template __device__ inline T& flatResult( Operand** operands, @@ -75,8 +126,10 @@ __device__ inline T& flatResult( int32_t blockBase, char* shared) { if (opIdx >= kMinSharedMemIndex) { - return reinterpret_cast( - shared + opIdx - kMinSharedMemIndex)[threadIdx.x]; + if (auto mask = (opIdx & kSharedNullMask)) { + shared[(kBlockSize * (mask - 1)) + blockIdx.x] = kNotNull; + } + return sharedMemoryOperand(shared, opIdx); } auto* op = operands[opIdx]; if (op->nulls) { diff --git a/velox/experimental/wave/exec/WaveDataSource.h b/velox/experimental/wave/exec/WaveDataSource.h index 6272b824ea315..ad478c002f27a 100644 --- a/velox/experimental/wave/exec/WaveDataSource.h +++ b/velox/experimental/wave/exec/WaveDataSource.h @@ -42,7 +42,7 @@ class WaveDataSource { virtual void addSplit(std::shared_ptr split) = 0; - virtual int32_t canAdvance() = 0; + virtual int32_t canAdvance(WaveStream& stream) = 0; virtual void schedule(WaveStream& stream, int32_t maxRows = 0) = 0; diff --git a/velox/experimental/wave/exec/WaveDriver.cpp b/velox/experimental/wave/exec/WaveDriver.cpp index 18347d01ea7cb..93802e64ec6ca 100644 --- a/velox/experimental/wave/exec/WaveDriver.cpp +++ b/velox/experimental/wave/exec/WaveDriver.cpp @@ -41,6 +41,9 @@ WaveDriver::WaveDriver( subfields_(std::move(subfields)), operands_(std::move(operands)) { VELOX_CHECK(!waveOperators.empty()); + auto returnBatchSize = 10000 * outputType_->size() * 10; + hostArena_ = std::make_unique( + returnBatchSize * 10, getHostAllocator(getDevice())); pipelines_.emplace_back(); for (auto& op : waveOperators) { op->setDriver(this); @@ -49,6 +52,7 @@ WaveDriver::WaveDriver( } pipelines_.back().operators.push_back(std::move(op)); } + pipelines_.back().needStatus = true; } RowVectorPtr WaveDriver::getOutput() { @@ -69,6 +73,7 @@ RowVectorPtr WaveDriver::getOutput() { ++it; continue; } + 
stream->setState(WaveStream::State::kNotRunning); RowVectorPtr result; if (i + 1 < pipelines_.size()) { auto waveResult = makeWaveResult(op.outputType(), *stream, lastSet); @@ -80,6 +85,7 @@ RowVectorPtr WaveDriver::getOutput() { VLOG(1) << "Final output size: " << result->size(); } if (streamAtEnd(*stream)) { + waveStats_.add(stream->stats()); it = streams.erase(it); } else { ++it; @@ -97,6 +103,7 @@ RowVectorPtr WaveDriver::getOutput() { } if (!running) { VLOG(1) << "No more output"; + updateStats(); finished_ = true; return nullptr; } @@ -127,19 +134,19 @@ RowVectorPtr WaveDriver::makeResult( const OperandSet& lastSet) { auto& last = *pipelines_.back().operators.back(); auto& rowType = last.outputType(); + auto operatorId = last.operatorId(); std::vector children(rowType->size()); + int32_t numRows = stream.getOutput( + operatorId, *operatorCtx_->pool(), resultOrder_, children.data()); auto result = std::make_shared( operatorCtx_->pool(), rowType, BufferPtr(nullptr), - last.outputSize(stream), + numRows, std::move(children)); - int32_t nthChild = 0; - std::vector waveVectors(resultOrder_.size()); - stream.getOutput(resultOrder_, waveVectors.data()); - for (auto& item : waveVectors) { - result->childAt(nthChild++) = item->toVelox(operatorCtx_->pool()); - }; + if (!numRows) { + return nullptr; + } return result; } @@ -150,25 +157,32 @@ void WaveDriver::startMore() { if (blockingReason_ != exec::BlockingReason::kNotBlocked) { return; } - if (auto rows = ops[0]->canAdvance()) { + auto stream = + std::make_unique(*arena_, *hostArena_, &operands()); + stream->setState(WaveStream::State::kHost); + + if (auto rows = ops[0]->canAdvance(*stream)) { VLOG(1) << "Advance " << rows << " rows in pipeline " << i; - auto stream = std::make_unique(*arena_); + stream->setNumRows(rows); + if (i == pipelines_.size() - 1) { + for (auto i : resultOrder_) { + stream->markHostOutputOperand(*operands_[i]); + } + } + stream->setReturnData(pipelines_[i].needStatus); for (auto& op : ops) { op->schedule(*stream, rows); } - if (i == pipelines_.size() - 1) { - prefetchReturn(*stream); + if (pipelines_[i].needStatus) { + stream->resultToHost(); } + stream->setState(WaveStream::State::kNotRunning); pipelines_[i].streams.push_back(std::move(stream)); break; } } } -void WaveDriver::prefetchReturn(WaveStream& stream) { - // Schedule return buffers from last op to be on host side. 
-} - LaunchControl* WaveDriver::inputControl( WaveStream& stream, int32_t operatorId) { @@ -200,4 +214,39 @@ std::string WaveDriver::toString() const { return out.str(); } +void WaveDriver::updateStats() { + auto lockedStats = stats_.wlock(); + lockedStats->addRuntimeStat( + "wave.numWaves", RuntimeCounter(waveStats_.numWaves)); + lockedStats->addRuntimeStat( + "wave.numKernels", RuntimeCounter(waveStats_.numKernels)); + lockedStats->addRuntimeStat( + "wave.numThreadBlocks", RuntimeCounter(waveStats_.numThreadBlocks)); + lockedStats->addRuntimeStat( + "wave.numThreads", RuntimeCounter(waveStats_.numThreads)); + lockedStats->addRuntimeStat( + "wave.numPrograms", RuntimeCounter(waveStats_.numPrograms)); + lockedStats->addRuntimeStat( + "wave.numSync", RuntimeCounter(waveStats_.numSync)); + lockedStats->addRuntimeStat( + "wave.bytesToDevice", + RuntimeCounter(waveStats_.bytesToDevice, RuntimeCounter::Unit::kBytes)); + lockedStats->addRuntimeStat( + "wave.bytesToHost", + RuntimeCounter(waveStats_.bytesToHost, RuntimeCounter::Unit::kBytes)); + lockedStats->addRuntimeStat( + "wave.hostOnlyTime", + RuntimeCounter( + waveStats_.hostOnlyTime.micros * 1000, RuntimeCounter::Unit::kNanos)); + lockedStats->addRuntimeStat( + "wave.hostParallelTime", + RuntimeCounter( + waveStats_.hostParallelTime.micros * 1000, + RuntimeCounter::Unit::kNanos)); + lockedStats->addRuntimeStat( + "wave.waitTime", + RuntimeCounter( + waveStats_.waitTime.micros * 1000, RuntimeCounter::Unit::kNanos)); +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/WaveDriver.h b/velox/experimental/wave/exec/WaveDriver.h index ea77b336f24eb..c80a5d8a2493c 100644 --- a/velox/experimental/wave/exec/WaveDriver.h +++ b/velox/experimental/wave/exec/WaveDriver.h @@ -57,6 +57,10 @@ class WaveDriver : public exec::SourceOperator { return *arena_; } + GpuArena& hostArena() const { + return *hostArena_; + } + const std::vector>& operands() { return operands_; } @@ -73,9 +77,11 @@ class WaveDriver : public exec::SourceOperator { std::string toString() const override; void addDynamicFilter( + const core::PlanNodeId& producer, column_index_t outputChannel, const std::shared_ptr& filter) override { - pipelines_[0].operators[0]->addDynamicFilter(outputChannel, filter); + pipelines_[0].operators[0]->addDynamicFilter( + producer, outputChannel, filter); } exec::OperatorCtx* operatorCtx() const { @@ -99,8 +105,7 @@ class WaveDriver : public exec::SourceOperator { // and there is space in the arena. void startMore(); - // Enqueus a prefetch from device to host for the buffers of output vectors. - void prefetchReturn(WaveStream& stream); + void updateStats(); std::unique_ptr arena_; std::unique_ptr deviceArena_; @@ -121,6 +126,10 @@ class WaveDriver : public exec::SourceOperator { // independently of each other. This is bounded by device memory and the // speed at which the source can produce new batches. std::list> streams; + /// True if status copy to host is needed after the last kernel. True if + /// returns vectors to host or if can produce multiple batches of output for + /// one input. + bool needStatus{false}; }; std::vector pipelines_; @@ -135,6 +144,7 @@ class WaveDriver : public exec::SourceOperator { SubfieldMap subfields_; // Operands handed over by compilation. 
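Taken together, getOutput() and startMore() above give each pipeline a small scheduling loop: reap streams whose work has arrived, then start a new stream while the source can still advance. A schematic standalone version (Stream and Pipeline here are simplified stand-ins, not the Wave classes):

```cpp
#include <cstdint>
#include <list>
#include <memory>

// Simplified stand-ins for WaveStream and WaveDriver::Pipeline.
struct Stream {
  bool arrived{false}; // all scheduled device work has completed
};

struct Pipeline {
  std::list<std::unique_ptr<Stream>> streams;
  // True if BlockStatus must be copied back after the last kernel.
  bool needStatus{false};
};

// One round of the driver loop: harvest arrived streams, then start a new
// one while the source can advance, mirroring getOutput()/startMore().
bool runOnce(Pipeline& pipeline, int32_t advanceableRows) {
  for (auto it = pipeline.streams.begin(); it != pipeline.streams.end();) {
    if ((*it)->arrived) {
      // The real driver merges the stream's WaveStats here before erasing.
      it = pipeline.streams.erase(it);
    } else {
      ++it;
    }
  }
  if (advanceableRows > 0) {
    pipeline.streams.push_back(std::make_unique<Stream>());
  }
  return !pipeline.streams.empty(); // still running?
}
```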
std::vector> operands_; + WaveStats waveStats_; }; } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/WaveHiveDataSource.cpp b/velox/experimental/wave/exec/WaveHiveDataSource.cpp index ddf5224287c6a..d6120f90b774f 100644 --- a/velox/experimental/wave/exec/WaveHiveDataSource.cpp +++ b/velox/experimental/wave/exec/WaveHiveDataSource.cpp @@ -94,8 +94,8 @@ void WaveHiveDataSource::addSplit( splitReader_->prepareSplit(metadataFilter_, runtimeStats_); } -int32_t WaveHiveDataSource::canAdvance() { - return splitReader_ != nullptr ? splitReader_->canAdvance() : 0; +int32_t WaveHiveDataSource::canAdvance(WaveStream& stream) { + return splitReader_ != nullptr ? splitReader_->canAdvance(stream) : 0; } void WaveHiveDataSource::schedule(WaveStream& stream, int32_t maxRows) { diff --git a/velox/experimental/wave/exec/WaveHiveDataSource.h b/velox/experimental/wave/exec/WaveHiveDataSource.h index 45ad75621d304..01d2719b21c91 100644 --- a/velox/experimental/wave/exec/WaveHiveDataSource.h +++ b/velox/experimental/wave/exec/WaveHiveDataSource.h @@ -46,7 +46,7 @@ class WaveHiveDataSource : public WaveDataSource { void setFromDataSource(std::shared_ptr dataSource) override; - int32_t canAdvance() override; + int32_t canAdvance(WaveStream& stream) override; void schedule(WaveStream& stream, int32_t maxRows) override; diff --git a/velox/experimental/wave/exec/WaveOperator.cpp b/velox/experimental/wave/exec/WaveOperator.cpp index 66ed0f2dc506f..3ed89becfe652 100644 --- a/velox/experimental/wave/exec/WaveOperator.cpp +++ b/velox/experimental/wave/exec/WaveOperator.cpp @@ -24,35 +24,39 @@ WaveOperator::WaveOperator( CompileState& state, const RowTypePtr& type, const std::string& planNodeId) - : id_(state.numOperators()), planNodeId_(planNodeId), outputType_(type) { - definesSubfields(state, outputType_); -} + : id_(state.numOperators()), planNodeId_(planNodeId), outputType_(type) {} -void WaveOperator::definesSubfields( +AbstractOperand* WaveOperator::definesSubfield( CompileState& state, const TypePtr& type, - const std::string& parentPath) { + const std::string& parentPath, + bool sourceNullable) { switch (type->kind()) { case TypeKind::ROW: { auto& row = type->as(); for (auto i = 0; i < type->size(); ++i) { auto& child = row.childAt(i); auto name = row.nameOf(i); - auto field = state.toSubfield(name); - subfields_.push_back(field); - types_.push_back(child); - auto operand = state.findCurrentValue(Value(field)); - if (!operand) { - operand = state.newOperand(child, name); - } - outputIds_.add(operand->id); - defines_[Value(field)] = operand; + std::string childPath = fmt::format("{}.{}", parentPath, name); + definesSubfield(state, child, childPath, sourceNullable); } } [[fallthrough]]; // TODO:Add cases for nested types. 
default: { - return; + auto field = state.toSubfield(parentPath); + subfields_.push_back(field); + types_.push_back(type); + auto operand = state.findCurrentValue(Value(field)); + if (!operand) { + operand = state.newOperand(type, parentPath); + } + if (sourceNullable && !operand->notNull && !operand->conditionalNonNull) { + operand->sourceNullable = true; + } + defines_[Value(field)] = operand; + + return operand; } } } @@ -61,4 +65,10 @@ folly::Synchronized& WaveOperator::stats() { return driver_->stats(); } +std::string WaveOperator::toString() const { + std::stringstream out; + out << "Id: " << id_ << " produces " << outputIds_.toString() << std::endl; + return out.str(); +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/WaveOperator.h b/velox/experimental/wave/exec/WaveOperator.h index 70f98e357246c..f0f157bcdd790 100644 --- a/velox/experimental/wave/exec/WaveOperator.h +++ b/velox/experimental/wave/exec/WaveOperator.h @@ -53,6 +53,10 @@ class WaveOperator { return isExpanding_; } + virtual bool isSource() const { + return false; + } + virtual bool isStreaming() const = 0; virtual void enqueue(WaveVectorPtr) { @@ -74,7 +78,7 @@ class WaveOperator { /// Returns how many rows of output are available from 'this'. Source /// operators and cardinality increasing operators must return a correct /// answer if they are ready to produce data. Others should return 0. - virtual int32_t canAdvance() { + virtual int32_t canAdvance(WaveStream& stream) { return 0; } @@ -93,12 +97,13 @@ class WaveOperator { VELOX_FAIL("Override for source or blocking operator"); } - virtual std::string toString() const = 0; + virtual std::string toString() const; - void definesSubfields( + AbstractOperand* definesSubfield( CompileState& state, const TypePtr& type, - const std::string& parentPath = ""); + const std::string& parentPath = "", + bool sourceNullable = false); /// Returns the operand if this is defined by 'this'. AbstractOperand* defines(Value value) { @@ -109,6 +114,11 @@ class WaveOperator { return it->second; } + /// Marks 'operand' as defined here. + void defined(Value value, AbstractOperand* op) { + defines_[value] = op; + } + void setDriver(WaveDriver* driver) { driver_ = driver; } @@ -124,12 +134,16 @@ class WaveOperator { return outputIds_; } + void addOutputId(OperandId id) { + outputIds_.add(id); + } + // The set of output operands that must have arrived for there to be a result. virtual const OperandSet& syncSet() const { return outputIds_; } - /// Called once on each Operator, fiest to last, after no more + /// Called once on each Operator, first to last, after no more /// Operators will be added to the WaveDriver plan. Can be used for /// e.g. making executable images of Programs since their content /// and dependences will no longer change. 
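definesSubfield() above recurses into ROW children and forms a dotted path per leaf, registering one AbstractOperand for each. A standalone sketch of the naming it produces (Field and leafPaths are illustrative only; the patch itself builds the child path with fmt::format("{}.{}", parentPath, name)):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Minimal stand-in for a row type: a name plus children (leaves have none).
struct Field {
  std::string name;
  std::vector<Field> children;
};

// Collects dotted leaf paths the way definesSubfield() extends parentPath
// for nested ROWs.
void leafPaths(
    const Field& field,
    const std::string& parent,
    std::vector<std::string>& out) {
  std::string path = parent.empty() ? field.name : parent + "." + field.name;
  if (field.children.empty()) {
    out.push_back(path);
    return;
  }
  for (auto& child : field.children) {
    leafPaths(child, path, out);
  }
}

int main() {
  Field row{"", {{"c0", {}}, {"c1", {{"a", {}}, {"b", {}}}}}};
  std::vector<std::string> paths;
  for (auto& child : row.children) {
    leafPaths(child, "", paths);
  }
  for (auto& p : paths) {
    printf("%s\n", p.c_str()); // c0, c1.a, c1.b
  }
}
```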
@@ -144,8 +158,9 @@ class WaveOperator { } virtual void addDynamicFilter( - column_index_t outputChannel, - const std::shared_ptr& filter) { + const core::PlanNodeId& /*producer*/, + column_index_t /*outputChannel*/, + const std::shared_ptr& /*filter*/) { VELOX_UNSUPPORTED(); } @@ -199,4 +214,16 @@ class WaveOperator { std::vector executableMemory_; }; +class WaveSourceOperator : public WaveOperator { + public: + WaveSourceOperator( + CompileState& state, + const RowTypePtr& outputType, + const std::string& planNodeId) + : WaveOperator(state, outputType, planNodeId) {} + bool isSource() const override { + return true; + } +}; + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/WaveSplitReader.h b/velox/experimental/wave/exec/WaveSplitReader.h index 6ba1a7adb25a4..769f4ec500615 100644 --- a/velox/experimental/wave/exec/WaveSplitReader.h +++ b/velox/experimental/wave/exec/WaveSplitReader.h @@ -56,7 +56,7 @@ class WaveSplitReader { virtual bool emptySplit() = 0; - virtual int32_t canAdvance() = 0; + virtual int32_t canAdvance(WaveStream& stream) = 0; virtual void schedule(WaveStream& stream, int32_t maxRows) = 0; diff --git a/velox/experimental/wave/exec/tests/AggregationTest.cpp b/velox/experimental/wave/exec/tests/AggregationTest.cpp index 499cc36bad55a..61d8400e3c38c 100644 --- a/velox/experimental/wave/exec/tests/AggregationTest.cpp +++ b/velox/experimental/wave/exec/tests/AggregationTest.cpp @@ -227,9 +227,3 @@ TEST_F(AggregationTest, tpchQ1) { } // namespace } // namespace facebook::velox::wave - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - folly::Init follyInit(&argc, &argv); - return RUN_ALL_TESTS(); -} diff --git a/velox/experimental/wave/exec/tests/CMakeLists.txt b/velox/experimental/wave/exec/tests/CMakeLists.txt index c6e8f81145e87..008261b7f1d44 100644 --- a/velox/experimental/wave/exec/tests/CMakeLists.txt +++ b/velox/experimental/wave/exec/tests/CMakeLists.txt @@ -15,7 +15,7 @@ add_subdirectory(utils) add_executable(velox_wave_exec_test FilterProjectTest.cpp TableScanTest.cpp - Main.cpp) + AggregationTest.cpp Main.cpp) add_test(velox_wave_exec_test velox_wave_exec_test) diff --git a/velox/experimental/wave/exec/tests/FilterProjectTest.cpp b/velox/experimental/wave/exec/tests/FilterProjectTest.cpp index 3b7fdb3a94ce9..230eba2fad8c0 100644 --- a/velox/experimental/wave/exec/tests/FilterProjectTest.cpp +++ b/velox/experimental/wave/exec/tests/FilterProjectTest.cpp @@ -67,6 +67,26 @@ class FilterProjectTest : public OperatorTestBase { auto task = assertQuery(plan, "SELECT c0, c1, c0 + c1 FROM tmp"); } + std::shared_ptr assertFilterProject( + const std::string& filter, + const std::vector& projections, + const std::vector& vectors) { + auto plan = PlanBuilder() + .values(vectors) + .filter(filter) + .project(projections) + .planNode(); + std::stringstream sql; + sql << "SELECT "; + for (auto i = 0; i < projections.size(); ++i) { + sql << " " << projections[i] << (i == projections.size() - 1 ? 
"" : ","); + } + sql << " FROM tmp WHERE " << filter; + + auto task = assertQuery(plan, sql.str()); + return task; + } + std::shared_ptr rowType_{ ROW({"c0", "c1", "c2", "c3"}, {BIGINT(), BIGINT(), SMALLINT(), DOUBLE()})}; @@ -96,3 +116,19 @@ TEST_F(FilterProjectTest, project) { assertProject(vectors); } + +TEST_F(FilterProjectTest, filterProject) { + std::vector vectors; + for (int32_t i = 0; i < 1; ++i) { + auto vector = std::dynamic_pointer_cast( + BatchMaker::createBatch(rowType_, 100, *pool_)); + makeNotNull(vector, 1000000000); + vectors.push_back(vector); + } + createDuckDbTable(vectors); + + assertFilterProject( + "c0 < 400000000", + std::vector{"c0", "c1", "c1 + c0 as s", "c2", "c3"}, + vectors); +} diff --git a/velox/experimental/wave/exec/tests/Main.cpp b/velox/experimental/wave/exec/tests/Main.cpp index 8bf768f54a66b..a54054a9a2667 100644 --- a/velox/experimental/wave/exec/tests/Main.cpp +++ b/velox/experimental/wave/exec/tests/Main.cpp @@ -17,7 +17,11 @@ #include #include +#include #include +#include "velox/experimental/wave/common/Cuda.h" + +DEFINE_bool(list_kernels, false, "Print register use of kernels"); // This main is needed for some tests on linux. int main(int argc, char** argv) { @@ -25,5 +29,8 @@ int main(int argc, char** argv) { // Signal handler required for ThreadDebugInfoTest facebook::velox::process::addDefaultFatalSignalHandler(); folly::Init init{&argc, &argv, false}; + if (FLAGS_list_kernels) { + facebook::velox::wave::printKernels(); + } return RUN_ALL_TESTS(); } diff --git a/velox/experimental/wave/exec/tests/TableScanTest.cpp b/velox/experimental/wave/exec/tests/TableScanTest.cpp index 0e6bbebdbd41e..955a3413f2b8b 100644 --- a/velox/experimental/wave/exec/tests/TableScanTest.cpp +++ b/velox/experimental/wave/exec/tests/TableScanTest.cpp @@ -52,6 +52,7 @@ class TableScanTest : public virtual HiveConnectorTestBase { void TearDown() override { wave::test::Table::dropAll(); + HiveConnectorTestBase::TearDown(); } std::vector makeVectors( @@ -69,6 +70,20 @@ class TableScanTest : public virtual HiveConnectorTestBase { return vectors; } + void makeNotNull( + RowVectorPtr row, + int64_t mod = std::numeric_limits::max()) { + for (auto i = 0; i < row->type()->size(); ++i) { + auto child = row->childAt(i); + if (auto ints = child->as>()) { + for (auto i = 0; i < child->size(); ++i) { + ints->set(i, ints->valueAt(i) % mod); + } + } + child->clearNulls(0, row->size()); + } + } + wave::test::SplitVector makeTable( const std::string& name, std::vector& rows) { @@ -144,10 +159,31 @@ TEST_F(TableScanTest, basic) { auto plan = tableScanNode(type); auto task = assertQuery(plan, splits, "SELECT * FROM tmp"); - // A quick sanity check for memory usage reporting. Check that peak total - // memory usage for the project node is > 0. 
auto planStats = toPlanStats(task->taskStats()); auto scanNodeId = plan->id(); auto it = planStats.find(scanNodeId); ASSERT_TRUE(it != planStats.end()); } + +TEST_F(TableScanTest, filter) { + auto type = + ROW({"c0", "c1", "c2", "c3"}, {BIGINT(), BIGINT(), BIGINT(), BIGINT()}); + auto vectors = makeVectors(type, 1, 1'000); + for (auto& vector : vectors) { + makeNotNull(vector, 1000000000); + } + auto splits = makeTable("test", vectors); + createDuckDbTable(vectors); + + auto plan = PlanBuilder(pool_.get()) + .tableScan(type) + .filter("c0 < 500000000") + .project({"c0", "c1 + 100000000 as c1", "c2", "c3"}) + .filter("c1 < 500000000") + .project({"c0", "c1", "c2 + 1", "c3", "c3 + 2"}) + .planNode(); + auto task = assertQuery( + plan, + splits, + "SELECT c0, c1 + 100000000, c2 + 1, c3, c3 + 2 FROM tmp where c0 < 500000000 and c1 + 100000000 < 500000000"); +} diff --git a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp index 487c1cded066d..d4ae6a8cab049 100644 --- a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp +++ b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp @@ -43,7 +43,7 @@ WaveTestSplitReader::WaveTestSplitReader( true); } -int32_t WaveTestSplitReader::canAdvance() { +int32_t WaveTestSplitReader::canAdvance(WaveStream& stream) { if (!stripe_) { return 0; } diff --git a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h index 726a36677de10..2881f6de03f1e 100644 --- a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h +++ b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h @@ -34,7 +34,7 @@ class WaveTestSplitReader : public WaveSplitReader { return !stripe_ || stripe_->columns[0]->numValues == 0; } - int32_t canAdvance() override; + int32_t canAdvance(WaveStream& stream) override; void schedule(WaveStream& stream, int32_t maxRows = 0) override; diff --git a/velox/experimental/wave/vector/Operand.h b/velox/experimental/wave/vector/Operand.h index cc3cdb8b75b9f..6a67798d0ae40 100644 --- a/velox/experimental/wave/vector/Operand.h +++ b/velox/experimental/wave/vector/Operand.h @@ -81,10 +81,18 @@ constexpr OperandId kNoOperand = ~0; using OperandIndex = uint16_t; constexpr OperandIndex kEmpty = ~0; -// operand indices above this are offsets into TB shared memory arrays. The -// value to use is the item at blockIx.x. +// operand indices above this are offsets into TB shared memory arrays. constexpr OperandIndex kMinSharedMemIndex = 0x8000; +// Number of nullable locals in shared memory. Each has kBlockSize null bytes at +// the start of the TB shared memory. 0 means no nulls. 1 means first kBlockSize +// bytes are nulls, 2 means second kBlockSize bytes are null flags etc. +constexpr uint16_t kSharedNullMask = 3; + +/// Start of the parameter array in the TB shared memory. 13 bits. Shift 1 left +/// to get offset. +constexpr uint16_t kSharedOperandMask = 0x7ffc; + /// Describes an operand for a Wave kernel instruction. The same /// insttruction is interpreted by multiple thread blocks in the /// kernel invocation. When accessing an operand, we have the base @@ -108,15 +116,35 @@ struct Operand { // Array of flat base values. Cast to pod type or StringView. void* base; + // Array of null indicators. No nulls if nullptr. A 1 means not-null, for + // consistency with Velox. + uint8_t* nulls; + // If non-nullptr, provides index into 'base. 
Subscripted with the // blockIdx - idx of first bllock wit this instruction // stream. Different thread blocks may or may not have indices for // a given operand. int32_t** indices; +}; - // Array of null indicators. No nulls if nullptr. A 1 means not-null, for - // consistency with Velox. - uint8_t* nulls; +/// Per-lane error code. +enum class ErrorCode : uint8_t { + // All operations completed. + kOk = 0, + + // Catchall for runtime errors. + kError, + + kInsufficientMemory, }; +/// Contains a count of active lanes and a per lane error code. +struct BlockStatus { + int32_t numRows{0}; + ErrorCode errors[kBlockSize]; +}; + +/// Returns the number of active rows in 'status' for 'numBlocks'. +int32_t statusNumRows(const BlockStatus* status, int32_t numBlocks); + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/vector/WaveVector.cpp b/velox/experimental/wave/vector/WaveVector.cpp index 544cfa536cd34..94a5cb4fffa73 100644 --- a/velox/experimental/wave/vector/WaveVector.cpp +++ b/velox/experimental/wave/vector/WaveVector.cpp @@ -24,7 +24,8 @@ namespace facebook::velox::wave { WaveVector::WaveVector( const TypePtr& type, GpuArena& arena, - std::vector> children) + std::vector> children, + bool notNull) : type_(type), kind_(type_->kind()), arena_(&arena), @@ -47,31 +48,33 @@ WaveVector::WaveVector( } } -void WaveVector::resize(vector_size_t size, bool nullable) { - if (size > size_) { - int64_t bytes; - if (type_->kind() == TypeKind::VARCHAR) { - bytes = sizeof(StringView) * size; - } else { - bytes = type_->cppSizeInBytes() * size; - } - if (!values_ || bytes > values_->capacity()) { - values_ = arena_->allocateBytes(bytes); - } - if (nullable) { - if (!nulls_ || nulls_->capacity() < size) { - nulls_ = arena_->allocateBytes(size); - } +void WaveVector::resize( + vector_size_t size, + bool nullable, + WaveBufferPtr* backing, + int64_t* backingOffset) { + auto capacity = values_ ? values_->capacity() : 0; + size_ = size; + int32_t bytesNeeded = backingSize(type_, size, nullable); + if (bytesNeeded > capacity) { + if (backing) { + values_ = WaveBufferView::create( + (*backing)->as() + *backingOffset, bytesNeeded, *backing); + *backingOffset += bytesNeeded; } else { - nulls_.reset(); + values_ = arena_->allocateBytes(bytesNeeded); } - size_ = size; + } + if (nullable) { + nulls_ = values_->as() + bytesNeeded - size; + } else { + nulls_ = nullptr; } } void WaveVector::toOperand(Operand* operand) const { operand->size = size_; - operand->nulls = nulls_ ? 
nulls_->as() : nullptr; + operand->nulls = nulls_; if (encoding_ == VectorEncoding::Simple::CONSTANT) { operand->indexMask = 0; operand->base = values_->as(); @@ -97,25 +100,32 @@ void toBits(uint64_t* words, int32_t numBytes) { } } +namespace { +class NoReleaser { + public: + void addRef() const {}; + void release() const {}; +}; + template static VectorPtr toVeloxTyped( vector_size_t size, velox::memory::MemoryPool* pool, const TypePtr& type, const WaveBufferPtr& values, - const WaveBufferPtr& nulls) { + const uint8_t* nulls) { using T = typename TypeTraits::NativeType; BufferPtr nullsView; if (nulls) { - nullsView = WaveBufferView::create(nulls); + nullsView = BufferView::create(nulls, size, NoReleaser()); toBits( const_cast(nullsView->as()), nullsView->capacity()); } BufferPtr valuesView; if (values) { - valuesView = WaveBufferView::create(values); + valuesView = VeloxWaveBufferView::create(values); } return std::make_shared>( @@ -127,9 +137,95 @@ static VectorPtr toVeloxTyped( std::vector()); } -VectorPtr WaveVector::toVelox(memory::MemoryPool* pool) { - return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL( +bool isDenselyFilled(const BlockStatus* status, int32_t numBlocks) { + for (int32_t i = 0; i < numBlocks - 1; ++i) { + if (status[i].numRows != kBlockSize) { + return false; + } + } + return true; +} +} // namespace + +int32_t statusNumRows(const BlockStatus* status, int32_t numBlocks) { + int32_t numRows = 0; + for (auto i = 0; i < numBlocks; ++i) { + numRows += status[i].numRows; + } + return numRows; +} + +// static +int32_t WaveVector::alignment(const TypePtr& type) { + switch (type->kind()) { + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + return sizeof(void*); + default: + return type->cppSizeInBytes(); + } +} + +// static +int64_t +WaveVector::backingSize(const TypePtr& type, int32_t size, bool nullable) { + int64_t bytes; + if (type->kind() == TypeKind::VARCHAR) { + bytes = sizeof(StringView) * size; + } else { + bytes = type->cppSizeInBytes() * size; + } + return bits::roundUp(bytes, sizeof(void*)) + (nullable ? size : 0); +} + +VectorPtr WaveVector::toVelox( + memory::MemoryPool* pool, + int32_t numBlocks, + const BlockStatus* status, + const Operand* operand) { + auto base = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL( toVeloxTyped, type_->kind(), size_, pool, type_, values_, nulls_); + if (!status || !operand) { + return base; + } + + // Translate the BlockStatus and indices in Operand to a host side dictionary + // wrap. + int maxRow = std::min(size_, numBlocks * kBlockSize); + numBlocks = bits::roundUp(maxRow, kBlockSize) / kBlockSize; + int numActive = statusNumRows(status, numBlocks); + auto operandIndices = operand->indices; + if (!operandIndices) { + // Vector sizes are >= active in status because they are allocated before + // the row count in status becomes known. + VELOX_CHECK_LE( + numActive, + size_, + "If there is no indirection in Operand, vector size must be <= BlockStatus"); + // If all blocks except last are filled we return base without wrap. + if (isDenselyFilled(status, numBlocks)) { + return base; + } + } + auto indices = AlignedBuffer::allocate(numActive, pool); + auto rawIndices = indices->asMutable(); + int32_t fill = 0; + for (auto block = 0; block < numBlocks; ++block) { + auto blockIndices = operandIndices ? 
operandIndices[block] : nullptr; + if (!blockIndices) { + for (auto i = 0; i < status[block].numRows; ++i) { + rawIndices[fill++] = block * kBlockSize + i; + } + } else { + memcpy( + rawIndices + fill, + blockIndices, + status[block].numRows * sizeof(int32_t)); + fill += status[block].numRows; + } + } + return BaseVector::wrapInDictionary( + BufferPtr(nullptr), indices, numActive, base); } } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/vector/WaveVector.h b/velox/experimental/wave/vector/WaveVector.h index d128bbd3c46af..aed7cfc32f45f 100644 --- a/velox/experimental/wave/vector/WaveVector.h +++ b/velox/experimental/wave/vector/WaveVector.h @@ -53,13 +53,14 @@ class WaveVector { } // Constructs a vector. Resize can be used to create buffers for a given size. - WaveVector(const TypePtr& type, GpuArena& arena) - : type_(type), kind_(type_->kind()), arena_(&arena) {} + WaveVector(const TypePtr& type, GpuArena& arena, bool notNull = false) + : type_(type), kind_(type_->kind()), arena_(&arena), notNull_(notNull) {} WaveVector( const TypePtr& type, GpuArena& arena, - std::vector> children); + std::vector> children, + bool notNull = false); const TypePtr& type() const { return type_; @@ -69,15 +70,28 @@ class WaveVector { return size_; } - void resize(vector_size_t sie, bool nullable = true); + /// Sets the size to 'size'. Allocates the backing memory from + /// 'arena_'. If 'backing' is non-nullptr, uses '*backing' for + /// backing store, starting at offset *backingOffset'. Returns the + /// offset of the first unused byte in '*backingOffset'. Leaves + /// contents uninitialized in all cases. + void resize( + vector_size_t size, + bool nullable = true, + WaveBufferPtr* backing = nullptr, + int64_t* backingOffset = nullptr); + + /// Returns the needed alignment for backing memory. + static int32_t alignment(const TypePtr& type); + + /// Returns the size in bytes for 'size' elements of 'type', including nulls + /// if 'nullable' is true. Does not include string buffers. + static int64_t backingSize(const TypePtr& type, int32_t size, bool nullable); bool mayHaveNulls() const { return nulls_ != nullptr; } - // Makes sure there is space for nulls. Initial value is undefined. - void ensureNulls(); - // Frees all allocated buffers. resize() can be used to populate the buffers // with a selected size. void clear(); @@ -95,15 +109,19 @@ class WaveVector { } uint8_t* nulls() { - if (nulls_) { - return nulls_->as(); - } - return nullptr; + return nulls_; } /// Returns a Velox vector giving a view on device side data. The device - /// buffers stay live while referenced by Velox. - VectorPtr toVelox(memory::MemoryPool* pool); + /// buffers stay live while referenced by Velox. If there is a selection, + /// numBlocks is the number of kBlockSize blocks the vector was allocated for, + /// BlockStatus gives the row counts per block and Operand gives the + /// dictionary indices representing the selection. + VectorPtr toVelox( + memory::MemoryPool* pool, + int32_t numBlocks = -1, + const BlockStatus* status = nullptr, + const Operand* operand = nullptr); /// Sets 'operand' to point to the buffers of 'this'. void toOperand(Operand* operand) const; @@ -126,31 +144,24 @@ class WaveVector { vector_size_t size_{0}; - // Values array, cast to pod type or StringView + // Values array, cast to pod type or StringView. If there are nulls, the null + // flags are in this buffer after the values, starting at 'null_' WaveBufferPtr values_; - // Nulls buffer, nullptr if no nulls. 
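How the pieces above compose: BlockStatus carries per-block row counts out of a kernel, the new resize() overload places a vector inside a shared backing allocation, and the extended toVelox() turns a selection into a host-side dictionary wrap. A sketch for one nullable BIGINT column, assuming 'arena' (a GpuArena), 'pool' (a memory::MemoryPool*), 'operand' (a const Operand*) and 'numRows' are supplied by the caller.

// Place the vector into a backing buffer sized by backingSize().
auto numBlocks = bits::roundUp(numRows, kBlockSize) / kBlockSize;
int64_t bytes = WaveVector::backingSize(BIGINT(), numRows, /*nullable=*/true);
WaveBufferPtr backing = arena.allocateBytes(bytes);
int64_t offset = 0;
WaveVector vector(BIGINT(), arena);
vector.resize(numRows, /*nullable=*/true, &backing, &offset);
// 'offset' is now the first unused byte; more vectors could be packed here so
// they move between device and host as one unit. Per the layout above, nulls()
// points at the tail of the values buffer.

std::vector<BlockStatus> status(numBlocks);  // Filled in by the kernel.

// Densely filled blocks come back as the base vector without a wrap; otherwise
// toVelox() builds dictionary indices from BlockStatus and Operand::indices.
VectorPtr host = vector.toVelox(pool, numBlocks, status.data(), operand);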
- WaveBufferPtr nulls_; + // Nulls, points to the tail of 'values'. nullptr if no nulls. + uint8_t* nulls_{nullptr}; // If dictionary or if wrapped in a selection, vector of indices into // 'values'. WaveBufferPtr indices_; - // Thread block level sizes. For each kBlockSize values, contains - // one int16_t that indicates how many of 'values' or 'indices' have - // a value. - WaveBufferPtr blockSizes_; - // Thread block level pointers inside 'indices_'. the ith entry is nullptr - // if the ith thread block has no row number mapping (all rows pass or none - // pass). - WaveBufferPtr blockIndices_; - // Lengths and offsets for array/map elements. WaveBufferPtr lengths_; WaveBufferPtr offsets_; // Members of a array/map/struct vector. std::vector> children_; + bool notNull_{false}; }; using WaveVectorPtr = std::unique_ptr; @@ -170,11 +181,17 @@ struct WaveReleaser { }; // A BufferView for velox::BaseVector for a view on unified memory. -class WaveBufferView : public BufferView { +class VeloxWaveBufferView : public BufferView { public: - static BufferPtr create(WaveBufferPtr buffer) { + /// Takes an additional reference to buffer. 'offset' and 'size' + /// allow sharing one allocation for many views. This is done when many + /// vectors have to be moved as a unit between device and host. + static BufferPtr + create(WaveBufferPtr buffer, int64_t offset = 0, int32_t size = -1) { return BufferView::create( - buffer->as(), buffer->capacity(), WaveReleaser(buffer)); + buffer->as() + offset, + size == -1 ? buffer->capacity() - offset : size, + WaveReleaser(buffer)); } }; diff --git a/velox/expression/CMakeLists.txt b/velox/expression/CMakeLists.txt index 3166487983811..c06c5be23fcb5 100644 --- a/velox/expression/CMakeLists.txt +++ b/velox/expression/CMakeLists.txt @@ -58,6 +58,5 @@ add_subdirectory(signature_parser) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) -elseif(${VELOX_BUILD_TEST_UTILS}) - add_subdirectory(tests/utils) + add_subdirectory(fuzzer) endif() diff --git a/velox/expression/CastExpr.cpp b/velox/expression/CastExpr.cpp index 6bb3eec0c9b8c..0e6e6fb40d4ad 100644 --- a/velox/expression/CastExpr.cpp +++ b/velox/expression/CastExpr.cpp @@ -918,6 +918,8 @@ void CastExpr::evalSpecialForm( inTopLevel = true; if (nullOnFailure()) { ScopedVarSetter holder{context.mutableThrowOnError(), false}; + ScopedVarSetter captureErrorDetails( + context.mutableCaptureErrorDetails(), false); apply(rows, input, context, fromType, toType, result); } else { apply(rows, input, context, fromType, toType, result); diff --git a/velox/expression/EvalCtx.cpp b/velox/expression/EvalCtx.cpp index 13c5aa520483d..f22f2fe376fa8 100644 --- a/velox/expression/EvalCtx.cpp +++ b/velox/expression/EvalCtx.cpp @@ -170,8 +170,35 @@ namespace { auto throwError(const std::exception_ptr& exceptionPtr) { std::rethrow_exception(toVeloxException(exceptionPtr)); } + +std::exception_ptr toVeloxUserError(const std::string& message) { + return std::make_exception_ptr(VeloxUserError( + __FILE__, + __LINE__, + __FUNCTION__, + "", + message, + error_source::kErrorSourceUser, + error_code::kInvalidArgument, + false /*retriable*/)); +} + } // namespace +void EvalCtx::setStatus(vector_size_t index, Status status) { + VELOX_CHECK(!status.ok(), "Status must be an error"); + + static std::exception_ptr kUserError = toVeloxUserError(""); + + if (status.isUserError()) { + setVeloxExceptionError( + index, + captureErrorDetails_ ? 
toVeloxUserError(status.message()) : kUserError); + } else { + VELOX_FAIL(status.message()); + } +} + void EvalCtx::setError( vector_size_t index, const std::exception_ptr& exceptionPtr) { diff --git a/velox/expression/EvalCtx.h b/velox/expression/EvalCtx.h index 1f4d0de95cb46..8285553de40a8 100644 --- a/velox/expression/EvalCtx.h +++ b/velox/expression/EvalCtx.h @@ -78,6 +78,9 @@ class EvalCtx { void restore(ContextSaver& saver); + // @param status Must indicate an error. Cannot be "ok". + void setStatus(vector_size_t index, Status status); + // If exceptionPtr is known to be a VeloxException use setVeloxExceptionError // instead. void setError(vector_size_t index, const std::exception_ptr& exceptionPtr); @@ -189,6 +192,8 @@ class EvalCtx { errors_.reset(); } + /// Boolean indicating whether exceptions that occur during expression + /// evaluation should be thrown directly or saved for later processing. bool throwOnError() const { return throwOnError_; } @@ -197,6 +202,19 @@ class EvalCtx { return &throwOnError_; } + /// Boolean indicating whether to capture details when storing exceptions for + /// later processing (throwOnError_ == false). + /// + /// Conjunct expressions (AND, OR) require capturing error details, while TRY + /// and TRY_CAST expressions do not. + bool captureErrorDetails() const { + return captureErrorDetails_; + } + + bool* mutableCaptureErrorDetails() { + return &captureErrorDetails_; + } + bool nullsPruned() const { return nullsPruned_; } @@ -352,6 +370,8 @@ class EvalCtx { bool nullsPruned_{false}; bool throwOnError_{true}; + bool captureErrorDetails_{true}; + // True if the current set of rows will not grow, e.g. not under and IF or OR. bool isFinalSelection_{true}; diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp index ed10a44bd9da6..05921fa432d36 100644 --- a/velox/expression/Expr.cpp +++ b/velox/expression/Expr.cpp @@ -696,14 +696,15 @@ std::string onTopLevelException(VeloxException::Type exceptionType, void* arg) { basePath = FLAGS_velox_save_input_on_expression_system_failure_path.c_str(); } if (strlen(basePath) == 0) { - return context->expr()->toString(); + return fmt::format("Top-level Expression: {}", context->expr()->toString()); } // Save input vector to a file. context->persistDataAndSql(basePath); return fmt::format( - "{}. Input data: {}. SQL expression: {}. All SQL expressions: {}.", + "Top-level Expression: {}. Input data: {}. SQL expression: {}." + " All SQL expressions: {}. ", context->expr()->toString(), context->dataPath(), context->sqlPath(), @@ -745,8 +746,9 @@ void Expr::evalFlatNoNullsImpl( const ExprSet* parentExprSet) { ExprExceptionContext exprExceptionContext{this, context.row(), parentExprSet}; ExceptionContextSetter exceptionContext( - {parentExprSet ? onTopLevelException : onException, - parentExprSet ? (void*)&exprExceptionContext : this}); + {.messageFunc = parentExprSet ? onTopLevelException : onException, + .arg = parentExprSet ? (void*)&exprExceptionContext : this, + .isEssential = parentExprSet != nullptr}); if (!rows.hasSelections()) { checkOrSetEmptyResult(type(), context.pool(), result); @@ -798,8 +800,9 @@ void Expr::eval( // exception. ExprExceptionContext exprExceptionContext{this, context.row(), parentExprSet}; ExceptionContextSetter exceptionContext( - {parentExprSet ? onTopLevelException : onException, - parentExprSet ? (void*)&exprExceptionContext : this}); + {.messageFunc = parentExprSet ? onTopLevelException : onException, + .arg = parentExprSet ?
(void*)&exprExceptionContext : this, + .isEssential = parentExprSet != nullptr}); if (!rows.hasSelections()) { checkOrSetEmptyResult(type(), context.pool(), result); diff --git a/velox/expression/Expr.h b/velox/expression/Expr.h index e999a36caaa84..14a947c36206a 100644 --- a/velox/expression/Expr.h +++ b/velox/expression/Expr.h @@ -819,14 +819,10 @@ class ExprSetListener { const ExprSetCompletionEvent& event) = 0; /// Called when a batch of rows encounters errors processing one or more - /// rows in a try expression to provide information about these errors. This - /// function must neither change rows nor errors. - /// @param rows Rows where errors exist. - /// @param errors Error vector produced inside the try expression. - virtual void onError( - const SelectivityVector& rows, - const ErrorVector& errors, - const std::string& queryId) = 0; + /// rows in a try expression to provide information about these errors. + /// @param numRows Number of rows with errors. + /// @param queryId Query ID. + virtual void onError(vector_size_t numRows, const std::string& queryId) = 0; }; /// Return the ExprSetListeners having been registered. diff --git a/velox/expression/SimpleFunctionAdapter.h b/velox/expression/SimpleFunctionAdapter.h index b6b0aae5f59ca..8c0bb54d5b12c 100644 --- a/velox/expression/SimpleFunctionAdapter.h +++ b/velox/expression/SimpleFunctionAdapter.h @@ -16,11 +16,13 @@ #pragma once +#include #include #include #include #include "velox/common/base/Portability.h" +#include "velox/common/base/Status.h" #include "velox/expression/ComplexWriterTypes.h" #include "velox/expression/DecodedArgs.h" #include "velox/expression/Expr.h" @@ -195,6 +197,10 @@ class SimpleFunctionAdapter : public VectorFunction { context.template applyToSelectedNoThrow(*rows, func); } + void setError(vector_size_t row, Status status) { + context.setStatus(row, status); + } + const SelectivityVector* rows; result_vector_t* result; VectorWriter resultWriter; @@ -618,7 +624,11 @@ class SimpleFunctionAdapter : public VectorFunction { // Result is NULL because the input contains NULL. notNull = false; } else { - notNull = doApplyNullFree<0>(row, out, readers...); + auto status = doApplyNullFree<0>(row, out, notNull, readers...); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + return; + } } writeResult(row, notNull, out); @@ -626,7 +636,12 @@ class SimpleFunctionAdapter : public VectorFunction { } else { applyContext.applyToSelectedNoThrow([&](auto row) INLINE_LAMBDA { typename return_type_traits::NativeType out{}; - bool notNull = doApplyNullFree<0>(row, out, readers...); + bool notNull; + auto status = doApplyNullFree<0>(row, out, notNull, readers...); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + return; + } writeResult(row, notNull, out); }); @@ -636,7 +651,13 @@ class SimpleFunctionAdapter : public VectorFunction { if (applyContext.allAscii) { applyContext.applyToSelectedNoThrow([&](auto row) INLINE_LAMBDA { typename return_type_traits::NativeType out{}; - bool notNull = doApplyAsciiNotNull<0>(row, out, readers...); + bool notNull; + auto status = + doApplyAsciiNotNull<0>(row, out, notNull, readers...); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + return; + } writeResult(row, notNull, out); }); return; @@ -648,13 +669,23 @@ class SimpleFunctionAdapter : public VectorFunction { // optimization (eliminating the temp) is easier to do by the // compiler (assuming the function call is inlined). 
typename return_type_traits::NativeType out{}; - bool notNull = doApplyNotNull<0>(row, out, readers...); + bool notNull; + auto status = doApplyNotNull<0>(row, out, notNull, readers...); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + return; + } writeResult(row, notNull, out); }); } else { applyContext.applyToSelectedNoThrow([&](auto row) INLINE_LAMBDA { typename return_type_traits::NativeType out{}; - bool notNull = doApply<0>(row, out, readers...); + bool notNull; + auto status = doApply<0>(row, out, notNull, readers...); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + return; + } writeResult(row, notNull, out); }); } @@ -664,36 +695,47 @@ class SimpleFunctionAdapter : public VectorFunction { // once per batch instead of once per row shows a significant // performance improvement when there are no nulls. if (applyContext.mayHaveNullsRecursive) { - applyUdf(applyContext, [&](auto& out, auto row) INLINE_LAMBDA { - auto containsNull = (readers.containsNull(row) || ...); - if (containsNull) { - // Result is NULL because the input contains NULL. - return false; - } - - return doApplyNullFree<0>(row, out, readers...); - }); + applyUdf( + applyContext, + [&](auto& out, auto& notNull, auto row) INLINE_LAMBDA { + auto containsNull = (readers.containsNull(row) || ...); + if (containsNull) { + // Result is NULL because the input contains NULL. + notNull = false; + return Status::OK(); + } + + return doApplyNullFree<0>(row, out, notNull, readers...); + }); } else { - applyUdf(applyContext, [&](auto& out, auto row) INLINE_LAMBDA { - return doApplyNullFree<0>(row, out, readers...); - }); + applyUdf( + applyContext, + [&](auto& out, auto& notNull, auto row) INLINE_LAMBDA { + return doApplyNullFree<0>(row, out, notNull, readers...); + }); } } else if (allNotNull) { if constexpr (FUNC::has_ascii) { if (applyContext.allAscii) { - applyUdf(applyContext, [&](auto& out, auto row) INLINE_LAMBDA { - return doApplyAsciiNotNull<0>(row, out, readers...); - }); + applyUdf( + applyContext, + [&](auto& out, auto& notNull, auto row) INLINE_LAMBDA { + return doApplyAsciiNotNull<0>(row, out, notNull, readers...); + }); return; } } - applyUdf(applyContext, [&](auto& out, auto row) INLINE_LAMBDA { - return doApplyNotNull<0>(row, out, readers...); - }); + applyUdf( + applyContext, + [&](auto& out, auto& notNull, auto row) INLINE_LAMBDA { + return doApplyNotNull<0>(row, out, notNull, readers...); + }); } else { - applyUdf(applyContext, [&](auto& out, auto row) INLINE_LAMBDA { - return doApply<0>(row, out, readers...); - }); + applyUdf( + applyContext, + [&](auto& out, auto& notNull, auto row) INLINE_LAMBDA { + return doApply<0>(row, out, notNull, readers...); + }); } } } @@ -709,16 +751,26 @@ class SimpleFunctionAdapter : public VectorFunction { applyContext.resultWriter.setOffset(row); // Force local copy of proxy. 
auto localWriter = currentWriter; - auto notNull = func(localWriter, row); - currentWriter = localWriter; - applyContext.resultWriter.commit(notNull); + bool notNull; + auto status = func(localWriter, notNull, row); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + } else { + currentWriter = localWriter; + applyContext.resultWriter.commit(notNull); + } }); applyContext.resultWriter.finish(); } else { applyContext.applyToSelectedNoThrow([&](auto row) INLINE_LAMBDA { applyContext.resultWriter.setOffset(row); - applyContext.resultWriter.commit( - func(applyContext.resultWriter.current(), row)); + bool notNull; + auto status = func(applyContext.resultWriter.current(), notNull, row); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + } else { + applyContext.resultWriter.commit(notNull); + } }); } } @@ -734,10 +786,11 @@ class SimpleFunctionAdapter : public VectorFunction { typename... Values, std::enable_if_t< POSITION = - 0> FOLLY_ALWAYS_INLINE bool + 0> FOLLY_ALWAYS_INLINE Status doApply( size_t idx, T& target, + bool& notNull, R0& currentReader, const Values&... extra) const { if (LIKELY(currentReader.isSet(idx))) { @@ -745,9 +798,10 @@ class SimpleFunctionAdapter : public VectorFunction { decltype(currentReader[idx]) v0 = currentReader[idx]; // recurse through the readers to build the arg list at compile time. - return doApply(idx, target, extra..., v0); + return doApply(idx, target, notNull, extra..., v0); } else { - return false; + notNull = false; + return Status::OK(); } } @@ -758,10 +812,11 @@ class SimpleFunctionAdapter : public VectorFunction { typename... Values, std::enable_if_t< POSITION = - 0> FOLLY_ALWAYS_INLINE bool + 0> FOLLY_ALWAYS_INLINE Status doApply( size_t idx, T& target, + bool& notNull, R0& currentReader, const Values&... extra) const { // Recurse through all the arguments to build the arg list at compile @@ -770,16 +825,17 @@ class SimpleFunctionAdapter : public VectorFunction { return doApply( idx, target, + notNull, extra..., (currentReader.isSet(idx) ? ¤tReader[idx] : nullptr)); } else { using temp_type = std::remove_reference_t; if (currentReader.isSet(idx)) { temp_type temp = currentReader[idx]; - return doApply(idx, target, extra..., &temp); + return doApply(idx, target, notNull, extra..., &temp); } else { return doApply( - idx, target, extra..., (const temp_type*)nullptr); + idx, target, notNull, extra..., (const temp_type*)nullptr); } } } @@ -791,9 +847,12 @@ class SimpleFunctionAdapter : public VectorFunction { std::enable_if_t< POSITION == FUNC::num_args && FUNC::is_default_null_behavior, int32_t> = 0> - FOLLY_ALWAYS_INLINE bool - doApply(size_t /*idx*/, T& target, const Values&... values) const { - return (*fn_).call(target, values...); + FOLLY_ALWAYS_INLINE Status doApply( + size_t /*idx*/, + T& target, + bool& notNull, + const Values&... values) const { + return (*fn_).call(target, notNull, values...); } // For NOT default null behavior, terminate with UDFHolder::callNullable. @@ -803,9 +862,12 @@ class SimpleFunctionAdapter : public VectorFunction { std::enable_if_t< POSITION == FUNC::num_args && !FUNC::is_default_null_behavior, int32_t> = 0> - FOLLY_ALWAYS_INLINE bool - doApply(size_t /*idx*/, T& target, const Values*... values) const { - return (*fn_).callNullable(target, values...); + FOLLY_ALWAYS_INLINE Status doApply( + size_t /*idx*/, + T& target, + bool& notNull, + const Values*... 
values) const { + return (*fn_).callNullable(target, notNull, values...); } // == NOT-NULL VARIANT == @@ -822,13 +884,14 @@ class SimpleFunctionAdapter : public VectorFunction { typename R0, typename... TStuff, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool doApplyNotNull( + FOLLY_ALWAYS_INLINE Status doApplyNotNull( size_t idx, T& target, + bool& notNull, R0& currentReader, const TStuff&... extra) const { decltype(currentReader[idx]) v0 = currentReader[idx]; - return doApplyNotNull(idx, target, extra..., v0); + return doApplyNotNull(idx, target, notNull, extra..., v0); } // For default null behavior, Terminate by with UDFHolder::call. @@ -836,9 +899,12 @@ class SimpleFunctionAdapter : public VectorFunction { size_t POSITION, typename... Values, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool - doApplyNotNull(size_t /*idx*/, T& target, const Values&... values) const { - return (*fn_).call(target, values...); + FOLLY_ALWAYS_INLINE Status doApplyNotNull( + size_t /*idx*/, + T& target, + bool& notNull, + const Values&... values) const { + return (*fn_).call(target, notNull, values...); } template < @@ -846,24 +912,27 @@ class SimpleFunctionAdapter : public VectorFunction { typename R0, typename... TStuff, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool doApplyAsciiNotNull( + FOLLY_ALWAYS_INLINE Status doApplyAsciiNotNull( size_t idx, T& target, + bool& notNull, R0& currentReader, const TStuff&... extra) const { decltype(currentReader[idx]) v0 = currentReader[idx]; - return doApplyAsciiNotNull(idx, target, extra..., v0); + return doApplyAsciiNotNull( + idx, target, notNull, extra..., v0); } template < size_t POSITION, typename... Values, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool doApplyAsciiNotNull( + FOLLY_ALWAYS_INLINE Status doApplyAsciiNotNull( size_t /*idx*/, T& target, + bool& notNull, const Values&... values) const { - return (*fn_).callAscii(target, values...); + return (*fn_).callAscii(target, notNull, values...); } template < @@ -871,22 +940,26 @@ class SimpleFunctionAdapter : public VectorFunction { typename R0, typename... TStuff, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool doApplyNullFree( + FOLLY_ALWAYS_INLINE Status doApplyNullFree( size_t idx, T& target, + bool& notNull, R0& currentReader, const TStuff&... extra) const { auto v0 = currentReader.readNullFree(idx); - return doApplyNullFree(idx, target, extra..., v0); + return doApplyNullFree(idx, target, notNull, extra..., v0); } template < size_t POSITION, typename... Values, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool - doApplyNullFree(size_t /*idx*/, T& target, const Values&... values) const { - return (*fn_).callNullFree(target, values...); + FOLLY_ALWAYS_INLINE Status doApplyNullFree( + size_t /*idx*/, + T& target, + bool& notNull, + const Values&... values) const { + return (*fn_).callNullFree(target, notNull, values...); } }; diff --git a/velox/expression/TryExpr.cpp b/velox/expression/TryExpr.cpp index fb840eedf9731..8850f3ac4e542 100644 --- a/velox/expression/TryExpr.cpp +++ b/velox/expression/TryExpr.cpp @@ -23,6 +23,9 @@ void TryExpr::evalSpecialForm( EvalCtx& context, VectorPtr& result) { ScopedVarSetter throwOnError(context.mutableThrowOnError(), false); + ScopedVarSetter captureErrorDetails( + context.mutableCaptureErrorDetails(), false); + // It's possible with nested TRY expressions that some rows already threw // exceptions in earlier expressions that haven't been handled yet. 
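The adapter rewiring above replaces the bool returned by every doApply* variant with a Status plus a notNull out-parameter; a non-OK Status reaches ApplyContext::setError(), which lands in EvalCtx::setStatus(), where user errors become per-row errors (detail-free under TRY, since captureErrorDetails() is then false) and anything else fails the batch via VELOX_FAIL. For orientation, a sketch of a simple function written against a Status-returning call(); the function is hypothetical and assumes UDFHolder adapts it to the (target, notNull, args...) shape used by the adapter, with Status factories from the newly included velox/common/base/Status.h.

template <typename TExec>
struct SafeDivideFunction {
  VELOX_DEFINE_FUNCTION_TYPES(TExec);

  // A non-OK Status records a per-row error instead of throwing.
  Status call(
      out_type<double>& result,
      const arg_type<double>& a,
      const arg_type<double>& b) {
    if (b == 0) {
      return Status::UserError("division by zero");
    }
    result = a / b;
    return Status::OK();
  }
};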
To avoid // incorrectly handling them here, store those errors and temporarily reset @@ -42,6 +45,9 @@ void TryExpr::evalSpecialFormSimplified( EvalCtx& context, VectorPtr& result) { ScopedVarSetter throwOnError(context.mutableThrowOnError(), false); + ScopedVarSetter captureErrorDetails( + context.mutableCaptureErrorDetails(), false); + // It's possible with nested TRY expressions that some rows already threw // exceptions in earlier expressions that haven't been handled yet. To avoid // incorrectly handling them here, store those errors and temporarily reset @@ -66,24 +72,21 @@ void applyListenersOnError( auto errors = context.errors(); VELOX_CHECK_NOT_NULL(errors); - exec::LocalSelectivityVector errorRows(context.execCtx(), errors->size()); - errorRows->clearAll(); + vector_size_t numErrors = 0; rows.applyToSelected([&](auto row) { if (row < errors->size() && !errors->isNullAt(row)) { - errorRows->setValid(row, true); + ++numErrors; } }); - errorRows->updateBounds(); - if (!errorRows->hasSelections()) { + if (numErrors == 0) { return; } exprSetListeners().withRLock([&](auto& listeners) { if (!listeners.empty()) { for (auto& listener : listeners) { - listener->onError( - *errorRows, *errors, context.execCtx()->queryCtx()->queryId()); + listener->onError(numErrors, context.execCtx()->queryCtx()->queryId()); } } }); diff --git a/velox/expression/fuzzer/ArgGenerator.h b/velox/expression/fuzzer/ArgGenerator.h new file mode 100644 index 0000000000000..4c015f7a38c79 --- /dev/null +++ b/velox/expression/fuzzer/ArgGenerator.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/expression/FunctionSignature.h" +#include "velox/vector/fuzzer/Utils.h" + +namespace facebook::velox::fuzzer { + +/// Generates random, but valid input types for a specified function signature +/// with the return type. +class ArgGenerator { + public: + virtual ~ArgGenerator() = default; + + /// Given a signature and a concrete return type returns randomly selected + /// valid input types. Returns empty vector if no input types can produce the + /// specified result type. + virtual std::vector generateArgs( + const exec::FunctionSignature& signature, + const TypePtr& returnType, + FuzzerGenerator& rng) = 0; +}; + +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/utils/ArgumentTypeFuzzer.cpp b/velox/expression/fuzzer/ArgumentTypeFuzzer.cpp similarity index 98% rename from velox/expression/tests/utils/ArgumentTypeFuzzer.cpp rename to velox/expression/fuzzer/ArgumentTypeFuzzer.cpp index 1b32b5c946746..1d7b784f29905 100644 --- a/velox/expression/tests/utils/ArgumentTypeFuzzer.cpp +++ b/velox/expression/fuzzer/ArgumentTypeFuzzer.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
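The ExprSetListener::onError() contract (changed in Expr.h above) now matches what applyListenersOnError() computes here: a bare error count plus the query id. A minimal conforming listener as a sketch; the class is hypothetical and the onCompletion() signature is assumed from ExprSetListener, which the hunk shows only in part.

class ErrorCountingListener : public exec::ExprSetListener {
 public:
  void onCompletion(
      const std::string& /*uuid*/,
      const ExprSetCompletionEvent& /*event*/) override {}

  // Row-level error details are no longer passed; only the count survives.
  void onError(vector_size_t numRows, const std::string& queryId) override {
    LOG(INFO) << "query " << queryId << ": " << numRows
              << " row(s) failed inside TRY";
  }
};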
*/ -#include "velox/expression/tests/utils/ArgumentTypeFuzzer.h" +#include "velox/expression/fuzzer/ArgumentTypeFuzzer.h" #include #include @@ -24,7 +24,7 @@ #include "velox/type/Type.h" #include "velox/vector/fuzzer/VectorFuzzer.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { std::string typeToBaseName(const TypePtr& type) { if (type->isDecimal()) { @@ -226,4 +226,4 @@ int32_t ArgumentTypeFuzzer::rand32(int32_t min, int32_t max) { return boost::random::uniform_int_distribution(min, max)(rng_); } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/utils/ArgumentTypeFuzzer.h b/velox/expression/fuzzer/ArgumentTypeFuzzer.h similarity index 98% rename from velox/expression/tests/utils/ArgumentTypeFuzzer.h rename to velox/expression/fuzzer/ArgumentTypeFuzzer.h index f21ced70c5212..9a01ef9c5e4a2 100644 --- a/velox/expression/tests/utils/ArgumentTypeFuzzer.h +++ b/velox/expression/fuzzer/ArgumentTypeFuzzer.h @@ -22,7 +22,7 @@ #include "velox/expression/SignatureBinder.h" #include "velox/type/Type.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { /// For function signatures using type variables, generates a list of /// arguments types. Optionally, allows to specify a desired return type. If @@ -104,4 +104,4 @@ std::string typeToBaseName(const TypePtr& type); /// Return the TypeKind that corresponds to typeName. std::optional baseNameToTypeKind(const std::string& typeName); -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/CMakeLists.txt b/velox/expression/fuzzer/CMakeLists.txt new file mode 100644 index 0000000000000..adb2949e7b40c --- /dev/null +++ b/velox/expression/fuzzer/CMakeLists.txt @@ -0,0 +1,47 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
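The ArgGenerator interface introduced above inverts the usual direction of signature binding: given a concrete return type, propose valid input types. A deliberately trivial implementation to show the contract; the class and its rule are made up, and the empty-vector return is the documented "no input types can produce this result type" signal.

class BigintPairArgGenerator : public ArgGenerator {
 public:
  std::vector<TypePtr> generateArgs(
      const exec::FunctionSignature& /*signature*/,
      const TypePtr& returnType,
      FuzzerGenerator& /*rng*/) override {
    if (returnType->kind() != TypeKind::BIGINT) {
      return {};  // No input types can produce this result type.
    }
    return {BIGINT(), BIGINT()};
  }
};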
+ +add_library(velox_expression_test_utility ArgumentTypeFuzzer.cpp + FuzzerToolkit.cpp) + +target_link_libraries(velox_expression_test_utility velox_type + velox_expression_functions gtest) + +add_library( + velox_expression_fuzzer + ArgumentTypeFuzzer.cpp DecimalArgGeneratorBase.cpp ExpressionFuzzer.cpp + FuzzerRunner.cpp ExpressionFuzzerVerifier.cpp) + +target_link_libraries( + velox_expression_fuzzer + velox_expression_verifier + velox_type + velox_vector_fuzzer + velox_vector_test_lib + velox_function_registry + velox_expression_test_utility) + +add_executable(velox_expression_fuzzer_test ExpressionFuzzerTest.cpp) + +target_link_libraries(velox_expression_fuzzer_test velox_expression_fuzzer + velox_functions_prestosql gtest gtest_main) + +add_executable(spark_expression_fuzzer_test SparkExpressionFuzzerTest.cpp) + +target_link_libraries(spark_expression_fuzzer_test velox_expression_fuzzer + velox_functions_spark gtest gtest_main) + +if(${VELOX_BUILD_TESTING}) + add_subdirectory(tests) +endif() diff --git a/velox/expression/fuzzer/DecimalArgGeneratorBase.cpp b/velox/expression/fuzzer/DecimalArgGeneratorBase.cpp new file mode 100644 index 0000000000000..1b65b67658c56 --- /dev/null +++ b/velox/expression/fuzzer/DecimalArgGeneratorBase.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/expression/fuzzer/DecimalArgGeneratorBase.h" +#include + +namespace facebook::velox::fuzzer { +namespace { + +// Returns all the possible decimal types. 
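// For scale: with p in 1..38 and s in 0..p there are 38 * 39 / 2 + 38 = 779
// distinct DECIMAL(p, s) types, so DecimalArgGeneratorBase::initialize(2)
// enumerates 779 * 779 = 606,841 ordered pairs. Doing that once in the
// constructor keeps the cost off the per-iteration fuzzing path.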
+const std::vector& getAllTypes() { + const auto generateAllTypes = []() { + std::vector allTypes; + for (auto p = 1; p <= 38; ++p) { + for (auto s = 0; s <= p; ++s) { + allTypes.push_back(DECIMAL(p, s)); + } + } + return allTypes; + }; + + static const std::vector allTypes = generateAllTypes(); + return allTypes; +} + +uint32_t rand32(uint32_t max, FuzzerGenerator& rng) { + return boost::random::uniform_int_distribution()(rng) % max; +} +} // namespace + +std::vector DecimalArgGeneratorBase::generateArgs( + const exec::FunctionSignature& /*signature*/, + const TypePtr& returnType, + FuzzerGenerator& rng) { + auto inputs = findInputs(returnType, rng); + for (const auto& input : inputs) { + if (input == nullptr) { + return {}; + } + } + return inputs; +} + +void DecimalArgGeneratorBase::initialize(uint32_t numArgs) { + switch (numArgs) { + case 1: { + for (const auto& t : getAllTypes()) { + auto [p, s] = getDecimalPrecisionScale(*t); + if (auto returnType = toReturnType(p, s)) { + inputs_[returnType.value()].push_back({t}); + } + } + break; + } + case 2: { + for (const auto& a : getAllTypes()) { + for (const auto& b : getAllTypes()) { + auto [p1, s1] = getDecimalPrecisionScale(*a); + auto [p2, s2] = getDecimalPrecisionScale(*b); + + if (auto returnType = toReturnType(p1, s1, p2, s2)) { + inputs_[returnType.value()].push_back({a, b}); + } + } + } + break; + } + default: + VELOX_NYI( + "Initialization with {} argument types is not supported.", numArgs); + } +} + +std::vector DecimalArgGeneratorBase::findInputs( + const TypePtr& returnType, + FuzzerGenerator& rng) const { + const auto [p, s] = getDecimalPrecisionScale(*returnType); + const auto it = inputs_.find({p, s}); + if (it == inputs_.end()) { + VLOG(1) << "Cannot find input types for " << returnType->toString(); + return {}; + } + + const auto index = rand32(it->second.size(), rng); + return it->second[index]; +} +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/DecimalArgGeneratorBase.h b/velox/expression/fuzzer/DecimalArgGeneratorBase.h new file mode 100644 index 0000000000000..c27db1b8d264f --- /dev/null +++ b/velox/expression/fuzzer/DecimalArgGeneratorBase.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/expression/fuzzer/ArgGenerator.h" + +namespace facebook::velox::fuzzer { + +/// An argument type generator for decimal function. A map keyed on the pair of +/// precision and scale could be initialized with all possible input types. +/// Argument types are generated by looking up the map with the precision and +/// scale of return type, and randomly selecting valid input types. Derived +/// classes should call 'initialize' from the constructor and specify the number +/// of decimal arguments. They should also implement toReturnType with matching +/// number of pairs of precision and scale. 
+class DecimalArgGeneratorBase : public ArgGenerator { + public: + std::vector generateArgs( + const exec::FunctionSignature& signature, + const TypePtr& returnType, + FuzzerGenerator& rng) override; + + protected: + // Computes result type for all possible pairs of decimal input types. Stores + // the results in 'inputs_' map keyed by the precision and scale of return + // type. + // @param numArgs the number of decimal argument types. It only supports + // initialization with one or two argument types. + virtual void initialize(uint32_t numArgs); + + // Given precisions and scales of the inputs, returns precision and scale of + // the result. Returns std::nullopt if a valid return type cannot be generated + // with inputs. Used when the return type is generated with one pair of input + // precision and scale. + virtual std::optional> toReturnType(int p, int s) { + VELOX_UNREACHABLE(); + } + + // Used when the return type is generated with two pairs of input precision + // and scale. + virtual std::optional> + toReturnType(int p1, int s1, int p2, int s2) { + VELOX_UNREACHABLE(); + } + + private: + // Returns randomly selected pair of input types that produce the specified + // result type. + std::vector findInputs( + const TypePtr& returnType, + FuzzerGenerator& rng) const; + + // Maps from the precision and scale of return type to corresponding input + // types. + std::unordered_map, std::vector>> + inputs_; +}; + +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/ExpressionFuzzer.cpp b/velox/expression/fuzzer/ExpressionFuzzer.cpp similarity index 99% rename from velox/expression/tests/ExpressionFuzzer.cpp rename to velox/expression/fuzzer/ExpressionFuzzer.cpp index f8bdf6c59c874..4817d71ac4627 100644 --- a/velox/expression/tests/ExpressionFuzzer.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzer.cpp @@ -26,10 +26,10 @@ #include "velox/expression/FunctionSignature.h" #include "velox/expression/ReverseSignatureBinder.h" #include "velox/expression/SimpleFunctionRegistry.h" -#include "velox/expression/tests/ExpressionFuzzer.h" -#include "velox/expression/tests/utils/ArgumentTypeFuzzer.h" +#include "velox/expression/fuzzer/ArgumentTypeFuzzer.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { namespace { using exec::SignatureBinder; @@ -1384,4 +1384,4 @@ RowTypePtr ExpressionFuzzer::fuzzRowReturnType(size_t size, char prefix) { return ROW(std::move(names), std::move(children)); } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/ExpressionFuzzer.h b/velox/expression/fuzzer/ExpressionFuzzer.h similarity index 99% rename from velox/expression/tests/ExpressionFuzzer.h rename to velox/expression/fuzzer/ExpressionFuzzer.h index fcc7bee02a27a..1e0dc9a74f748 100644 --- a/velox/expression/tests/ExpressionFuzzer.h +++ b/velox/expression/fuzzer/ExpressionFuzzer.h @@ -19,13 +19,13 @@ #include "velox/core/ITypedExpr.h" #include "velox/core/QueryCtx.h" #include "velox/expression/Expr.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/expression/tests/ExpressionVerifier.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" #include "velox/functions/FunctionRegistry.h" #include "velox/vector/fuzzer/VectorFuzzer.h" #include "velox/vector/tests/utils/VectorMaker.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { // A tool that can be used to generate random expressions. 
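To make the base-class contract concrete: a hypothetical generator for a two-argument decimal function. The constructor calls initialize(2) as the comments above prescribe, and the matching two-pair toReturnType() supplies the inverse mapping; the precision/scale rule follows the common decimal-addition convention and is an assumption, not taken from this change.

class DecimalAddArgGenerator : public DecimalArgGeneratorBase {
 public:
  DecimalAddArgGenerator() {
    initialize(2);  // Enumerate all pairs of decimal input types once.
  }

 protected:
  // Assumed rule: result scale is max(s1, s2) and precision grows by one
  // integer digit; anything past precision 38 is unrepresentable.
  std::optional<std::pair<int, int>> toReturnType(int p1, int s1, int p2, int s2)
      override {
    const auto s = std::max(s1, s2);
    const auto p = std::max(p1 - s1, p2 - s2) + s + 1;
    if (p > 38) {
      return std::nullopt;
    }
    return std::make_pair(p, s);
  }
};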
class ExpressionFuzzer { @@ -418,4 +418,4 @@ class ExpressionFuzzer { friend class ExpressionFuzzerUnitTest; }; -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/ExpressionFuzzerTest.cpp b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp similarity index 94% rename from velox/expression/tests/ExpressionFuzzerTest.cpp rename to velox/expression/fuzzer/ExpressionFuzzerTest.cpp index e6c0985c255a9..e9f35f02f7702 100644 --- a/velox/expression/tests/ExpressionFuzzerTest.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp @@ -18,7 +18,7 @@ #include #include -#include "velox/expression/tests/FuzzerRunner.h" +#include "velox/expression/fuzzer/FuzzerRunner.h" #include "velox/functions/prestosql/registration/RegistrationFunctions.h" DEFINE_int64( @@ -27,7 +27,7 @@ DEFINE_int64( "Initial seed for random number generator used to reproduce previous " "results (0 means start with random seed)."); -using facebook::velox::test::FuzzerRunner; +using facebook::velox::fuzzer::FuzzerRunner; int main(int argc, char** argv) { facebook::velox::functions::prestosql::registerAllScalarFunctions(); @@ -64,7 +64,6 @@ int main(int argc, char** argv) { "regexp_extract", "regexp_extract_all", "regexp_like", - "map_top_n", // https://github.com/facebookincubator/velox/issues/9497 }; size_t initialSeed = FLAGS_seed == 0 ? std::time(nullptr) : FLAGS_seed; return FuzzerRunner::run(initialSeed, skipFunctions, {{}}); diff --git a/velox/expression/tests/ExpressionFuzzerVerifier.cpp b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp similarity index 96% rename from velox/expression/tests/ExpressionFuzzerVerifier.cpp rename to velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp index 8e36fd739f21c..9ea2d1e0421c3 100644 --- a/velox/expression/tests/ExpressionFuzzerVerifier.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "velox/expression/tests/ExpressionFuzzerVerifier.h" +#include "velox/expression/fuzzer/ExpressionFuzzerVerifier.h" #include #include @@ -24,9 +24,9 @@ #include "velox/expression/Expr.h" #include "velox/expression/FunctionSignature.h" #include "velox/expression/ReverseSignatureBinder.h" -#include "velox/expression/tests/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { namespace { @@ -250,9 +250,9 @@ void ExpressionFuzzerVerifier::retryWithTry( false, // canThrow columnsToWrapInLazy) .result; - } catch (const std::exception& e) { + } catch (const std::exception&) { if (options_.findMinimalSubexpression) { - computeMinimumSubExpression( + test::computeMinimumSubExpression( {&execCtx_, {false, ""}}, *vectorFuzzer_, plans, @@ -281,9 +281,9 @@ void ExpressionFuzzerVerifier::retryWithTry( : nullptr, false, // canThrow columnsToWrapInLazy); - } catch (const std::exception& e) { + } catch (const std::exception&) { if (options_.findMinimalSubexpression) { - computeMinimumSubExpression( + test::computeMinimumSubExpression( {&execCtx_, {false, ""}}, *vectorFuzzer_, plans, @@ -339,9 +339,9 @@ void ExpressionFuzzerVerifier::go() { resultVectors ? 
BaseVector::copy(*resultVectors) : nullptr, true, // canThrow columnsToWrapInLazy); - } catch (const std::exception& e) { + } catch (const std::exception&) { if (options_.findMinimalSubexpression) { - computeMinimumSubExpression( + test::computeMinimumSubExpression( {&execCtx_, {false, ""}}, *vectorFuzzer_, plans, @@ -374,4 +374,4 @@ void ExpressionFuzzerVerifier::go() { LOG(ERROR) << "Total failed: " << numFailed; } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/ExpressionFuzzerVerifier.h b/velox/expression/fuzzer/ExpressionFuzzerVerifier.h similarity index 95% rename from velox/expression/tests/ExpressionFuzzerVerifier.h rename to velox/expression/fuzzer/ExpressionFuzzerVerifier.h index 2f85b5d52bc71..f651ad5541430 100644 --- a/velox/expression/tests/ExpressionFuzzerVerifier.h +++ b/velox/expression/fuzzer/ExpressionFuzzerVerifier.h @@ -19,16 +19,16 @@ #include "velox/core/ITypedExpr.h" #include "velox/core/QueryCtx.h" #include "velox/expression/Expr.h" -#include "velox/expression/tests/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/expression/tests/ExpressionVerifier.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" #include "velox/functions/FunctionRegistry.h" #include "velox/vector/fuzzer/VectorFuzzer.h" #include "velox/vector/tests/utils/VectorMaker.h" DECLARE_int32(velox_fuzzer_max_level_of_nesting); -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { // A tool that utilizes ExpressionFuzzer, VectorFuzzer and ExpressionVerfier to // generate random expressions and verify the correctness of the results. It @@ -139,10 +139,8 @@ class ExpressionFuzzerVerifier { // A no-op since we cannot tie errors directly to functions where they // occurred. - void onError( - const SelectivityVector& /*rows*/, - const ::facebook::velox::ErrorVector& /*errors*/, - const std::string& /*queryId*/) override {} + void onError(vector_size_t /*numRows*/, const std::string& /*queryId*/) + override {} private: std::unordered_map& exprNameToStats_; @@ -212,4 +210,4 @@ class ExpressionFuzzerVerifier { ExpressionFuzzer expressionFuzzer_; }; -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/FuzzerRunner.cpp b/velox/expression/fuzzer/FuzzerRunner.cpp similarity index 97% rename from velox/expression/tests/FuzzerRunner.cpp rename to velox/expression/fuzzer/FuzzerRunner.cpp index ac741944dfa61..56c58a3658413 100644 --- a/velox/expression/tests/FuzzerRunner.cpp +++ b/velox/expression/fuzzer/FuzzerRunner.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "velox/expression/tests/FuzzerRunner.h" -#include "velox/expression/tests/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/FuzzerRunner.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" DEFINE_int32(steps, 10, "Number of expressions to generate and execute."); @@ -148,7 +148,7 @@ DEFINE_string( "of functions at every instance. Number of tickets must be a positive " "integer. 
Example: eq=3,floor=5"); -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { namespace { VectorFuzzer::Options getVectorFuzzerOptions() { @@ -222,4 +222,4 @@ void FuzzerRunner::runFromGtest( getExpressionFuzzerVerifierOptions(skipFunctions, queryConfigs)) .go(); } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/FuzzerRunner.h b/velox/expression/fuzzer/FuzzerRunner.h similarity index 90% rename from velox/expression/tests/FuzzerRunner.h rename to velox/expression/fuzzer/FuzzerRunner.h index cbf3d5ac290a9..0eda0ecd1d7a9 100644 --- a/velox/expression/tests/FuzzerRunner.h +++ b/velox/expression/fuzzer/FuzzerRunner.h @@ -22,10 +22,10 @@ #include #include -#include "velox/expression/tests/ExpressionFuzzerVerifier.h" +#include "velox/expression/fuzzer/ExpressionFuzzerVerifier.h" #include "velox/functions/FunctionRegistry.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { /// FuzzerRunner leverages ExpressionFuzzerVerifier to create a gtest unit test. class FuzzerRunner { @@ -41,4 +41,4 @@ class FuzzerRunner { const std::unordered_map& queryConfigs); }; -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/utils/FuzzerToolkit.cpp b/velox/expression/fuzzer/FuzzerToolkit.cpp similarity index 97% rename from velox/expression/tests/utils/FuzzerToolkit.cpp rename to velox/expression/fuzzer/FuzzerToolkit.cpp index cbbc60b4c08c1..292f4619bb661 100644 --- a/velox/expression/tests/utils/FuzzerToolkit.cpp +++ b/velox/expression/fuzzer/FuzzerToolkit.cpp @@ -13,9 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "velox/expression/tests/utils/FuzzerToolkit.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { std::string CallableSignature::toString() const { std::string buf = name; @@ -137,4 +137,4 @@ void compareVectors( LOG(INFO) << "Two vectors match."; } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/utils/FuzzerToolkit.h b/velox/expression/fuzzer/FuzzerToolkit.h similarity index 97% rename from velox/expression/tests/utils/FuzzerToolkit.h rename to velox/expression/fuzzer/FuzzerToolkit.h index 0411d4aaecc12..9d78d0899c82d 100644 --- a/velox/expression/tests/utils/FuzzerToolkit.h +++ b/velox/expression/fuzzer/FuzzerToolkit.h @@ -18,7 +18,7 @@ #include "velox/expression/FunctionSignature.h" #include "velox/vector/ComplexVector.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { // Represents one available function signature. 
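 // (CallableSignature carries the function name plus one concrete set of
 // argument types and the resolved return type.)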
struct CallableSignature { @@ -111,4 +111,4 @@ void compareVectors( const std::string& leftName = "left", const std::string& rightName = "right", const std::optional& rows = std::nullopt); -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/SparkExpressionFuzzerTest.cpp b/velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp similarity index 95% rename from velox/expression/tests/SparkExpressionFuzzerTest.cpp rename to velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp index c9531632f4137..ffba105e2e08d 100644 --- a/velox/expression/tests/SparkExpressionFuzzerTest.cpp +++ b/velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp @@ -22,7 +22,7 @@ #include #include -#include "velox/expression/tests/FuzzerRunner.h" +#include "velox/expression/fuzzer/FuzzerRunner.h" #include "velox/functions/sparksql/Register.h" DEFINE_int64( @@ -31,7 +31,7 @@ DEFINE_int64( "Initial seed for random number generator " "(use it to reproduce previous results)."); -using facebook::velox::test::FuzzerRunner; +using facebook::velox::fuzzer::FuzzerRunner; int main(int argc, char** argv) { facebook::velox::functions::sparksql::registerFunctions(""); diff --git a/velox/expression/tests/ArgumentTypeFuzzerTest.cpp b/velox/expression/fuzzer/tests/ArgumentTypeFuzzerTest.cpp similarity index 99% rename from velox/expression/tests/ArgumentTypeFuzzerTest.cpp rename to velox/expression/fuzzer/tests/ArgumentTypeFuzzerTest.cpp index 595a09ccaf8d4..32971df22e0d6 100644 --- a/velox/expression/tests/ArgumentTypeFuzzerTest.cpp +++ b/velox/expression/fuzzer/tests/ArgumentTypeFuzzerTest.cpp @@ -14,14 +14,14 @@ * limitations under the License. */ -#include "velox/expression/tests/utils/ArgumentTypeFuzzer.h" +#include "velox/expression/fuzzer/ArgumentTypeFuzzer.h" #include #include "velox/expression/SignatureBinder.h" #include "velox/type/Type.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer::test { namespace { const uint32_t kMaxVariadicArgs = 5; @@ -649,4 +649,4 @@ TEST_F(ArgumentTypeFuzzerTest, fuzzDecimalReturnType) { EXPECT_EQ(DECIMAL(10, 7)->toString(), returnType->toString()); } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer::test diff --git a/velox/expression/fuzzer/tests/CMakeLists.txt b/velox/expression/fuzzer/tests/CMakeLists.txt new file mode 100644 index 0000000000000..64af95929dcbc --- /dev/null +++ b/velox/expression/fuzzer/tests/CMakeLists.txt @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
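+
+# Unit tests for the expression fuzzer's building blocks: argument type
+# fuzzing, decimal argument generation, and the fuzzer itself.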
+
+add_executable(velox_expression_fuzzer_unit_test ArgumentTypeFuzzerTest.cpp DecimalArgGeneratorTest.cpp ExpressionFuzzerUnitTest.cpp)
+
+target_link_libraries(
+  velox_expression_fuzzer_unit_test
+  velox_expression_fuzzer
+  velox_functions_prestosql
+  velox_core
+  velox_expression
+  gtest
+  gtest_main)
diff --git a/velox/expression/fuzzer/tests/DecimalArgGeneratorTest.cpp b/velox/expression/fuzzer/tests/DecimalArgGeneratorTest.cpp
new file mode 100644
index 0000000000000..4de9c473ad016
--- /dev/null
+++ b/velox/expression/fuzzer/tests/DecimalArgGeneratorTest.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include "velox/expression/SignatureBinder.h"
+#include "velox/expression/fuzzer/DecimalArgGeneratorBase.h"
+
+namespace facebook::velox::fuzzer::test {
+
+class DecimalArgGeneratorTest : public testing::Test {
+ protected:
+  class UnaryArgGenerator : public DecimalArgGeneratorBase {
+   public:
+    UnaryArgGenerator() {
+      initialize(1);
+    }
+
+   protected:
+    std::optional<std::pair<int, int>> toReturnType(int p, int s) override {
+      auto precision = std::min(38, p + s + 1);
+      auto scale = std::min(s + 1, 18);
+      return {{precision, scale}};
+    }
+  };
+
+  class BinaryArgGenerator : public DecimalArgGeneratorBase {
+   public:
+    BinaryArgGenerator() {
+      initialize(2);
+    }
+
+   protected:
+    std::optional<std::pair<int, int>>
+    toReturnType(int p1, int s1, int p2, int s2) override {
+      auto s = std::max(s1, s2);
+      auto p = std::min(38, std::max(p1 - s1, p2 - s2) + std::max(s1, s2) + 1);
+      return {{p, s}};
+    }
+  };
+
+  // Assert the equivalence between the given return type and the actual type
+  // resolved from generated argument types.
+  void assertReturnType(
+      const std::shared_ptr<ArgGenerator>& generator,
+      const exec::FunctionSignature& signature,
+      const TypePtr& returnType) {
+    std::mt19937 seed{0};
+    const auto argTypes = generator->generateArgs(signature, returnType, seed);
+
+    // Resolve return type from argument types for the given signature.
+    TypePtr actualType;
+    exec::SignatureBinder binder(signature, argTypes);
+    if (binder.tryBind()) {
+      actualType = binder.tryResolveReturnType();
+    } else {
+      VELOX_FAIL("Failed to resolve return type from argument types.");
+    }
+    EXPECT_TRUE(returnType->equivalent(*actualType))
+        << "Expected type: " << returnType->toString()
+        << ", actual type: " << actualType->toString();
+  }
+
+  // Assert that no argument types can be generated for the given return type.
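+  // This is expected when no input precision/scale combination produced by
+  // toReturnType() can yield the requested return type.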
+  void assertEmptyArgs(
+      std::shared_ptr<ArgGenerator> generator,
+      const exec::FunctionSignature& signature,
+      const TypePtr& returnType) {
+    std::mt19937 seed{0};
+    const auto argTypes = generator->generateArgs(signature, returnType, seed);
+    EXPECT_TRUE(argTypes.empty());
+  }
+};
+
+TEST_F(DecimalArgGeneratorTest, unary) {
+  auto signature =
+      exec::FunctionSignatureBuilder()
+          .integerVariable("scale")
+          .integerVariable("precision")
+          .integerVariable("r_precision", "min(38, precision + scale + 1)")
+          .integerVariable("r_scale", "min(scale + 1, 18)")
+          .returnType("decimal(r_precision, r_scale)")
+          .argumentType("decimal(precision, scale)")
+          .build();
+
+  const auto generator = std::make_shared<UnaryArgGenerator>();
+  for (auto returnType : {DECIMAL(10, 2), DECIMAL(38, 18)}) {
+    assertReturnType(generator, *signature, returnType);
+  }
+  assertEmptyArgs(generator, *signature, DECIMAL(38, 20));
+}
+
+TEST_F(DecimalArgGeneratorTest, binary) {
+  auto signature =
+      exec::FunctionSignatureBuilder()
+          .integerVariable("a_scale")
+          .integerVariable("b_scale")
+          .integerVariable("a_precision")
+          .integerVariable("b_precision")
+          .integerVariable(
+              "r_precision",
+              "min(38, max(a_precision - a_scale, b_precision - b_scale) + max(a_scale, b_scale) + 1)")
+          .integerVariable("r_scale", "max(a_scale, b_scale)")
+          .returnType("decimal(r_precision, r_scale)")
+          .argumentType("decimal(a_precision, a_scale)")
+          .argumentType("decimal(b_precision, b_scale)")
+          .build();
+
+  const auto generator = std::make_shared<BinaryArgGenerator>();
+  for (auto returnType :
+       {DECIMAL(10, 2), DECIMAL(38, 20), DECIMAL(38, 38), DECIMAL(38, 0)}) {
+    assertReturnType(generator, *signature, returnType);
+  }
+}
+
+} // namespace facebook::velox::fuzzer::test
diff --git a/velox/expression/tests/ExpressionFuzzerUnitTest.cpp b/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp
similarity index 97%
rename from velox/expression/tests/ExpressionFuzzerUnitTest.cpp
rename to velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp
index 9fd3a09b3a047..2a00931336876 100644
--- a/velox/expression/tests/ExpressionFuzzerUnitTest.cpp
+++ b/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp
@@ -16,10 +16,10 @@
 
 #include <gtest/gtest.h>
 
-#include "velox/expression/tests/ExpressionFuzzer.h"
+#include "velox/expression/fuzzer/ExpressionFuzzer.h"
 #include "velox/functions/prestosql/registration/RegistrationFunctions.h"
 
-namespace facebook::velox::test {
+namespace facebook::velox::fuzzer::test {
 class ExpressionFuzzerUnitTest : public testing::Test {
  protected:
   static void SetUpTestCase() {
@@ -199,4 +199,4 @@ TEST_F(ExpressionFuzzerUnitTest, exprBank) {
   }
 }
 
-} // namespace facebook::velox::test
+} // namespace facebook::velox::fuzzer::test
diff --git a/velox/expression/tests/CMakeLists.txt b/velox/expression/tests/CMakeLists.txt
index 6958e22f9276d..c58c7963834e3 100644
--- a/velox/expression/tests/CMakeLists.txt
+++ b/velox/expression/tests/CMakeLists.txt
@@ -12,11 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_subdirectory(utils)
-
 add_executable(
   velox_expression_test
-  ArgumentTypeFuzzerTest.cpp
   ArrayViewTest.cpp
   ArrayWriterTest.cpp
   CastExprTest.cpp
@@ -88,29 +85,6 @@ target_link_libraries(
   velox_expression_verifier velox_vector_test_lib velox_vector_fuzzer
   velox_type velox_expression_test_utility)
 
-add_library(velox_expression_fuzzer ExpressionFuzzer.cpp FuzzerRunner.cpp
-            ExpressionFuzzerVerifier.cpp)
-
-target_link_libraries(
-  velox_expression_fuzzer
-  velox_expression_verifier
-  velox_type
-  velox_vector_fuzzer
-  velox_vector_test_lib
-  velox_function_registry
-  velox_expression_test_utility)
-
-add_executable(velox_expression_fuzzer_unit_test ExpressionFuzzerUnitTest.cpp)
-
-target_link_libraries(
-  velox_expression_fuzzer_unit_test
-  velox_expression_fuzzer
-  velox_functions_prestosql
-  velox_core
-  velox_expression
-  gtest
-  gtest_main)
-
 add_library(velox_expression_runner ExpressionRunner.cpp)
 
 target_link_libraries(
   velox_expression_runner velox_expression_verifier velox_functions_prestosql
@@ -150,13 +124,3 @@ target_link_libraries(
   velox_vector_test_lib
   gtest
   gtest_main)
-
-add_executable(velox_expression_fuzzer_test ExpressionFuzzerTest.cpp)
-
-target_link_libraries(velox_expression_fuzzer_test velox_expression_fuzzer
-                      velox_functions_prestosql gtest gtest_main)
-
-add_executable(spark_expression_fuzzer_test SparkExpressionFuzzerTest.cpp)
-
-target_link_libraries(spark_expression_fuzzer_test velox_expression_fuzzer
-                      velox_functions_spark gtest gtest_main)
diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp
index aa0b07ff9fb89..e7660ef058060 100644
--- a/velox/expression/tests/CastExprTest.cpp
+++ b/velox/expression/tests/CastExprTest.cpp
@@ -710,7 +710,6 @@ TEST_F(CastExprTest, dateToTimestamp) {
 }
 
 TEST_F(CastExprTest, timestampToDate) {
-  setTimezone("");
   std::vector<std::optional<Timestamp>> inputTimestamps = {
       Timestamp(0, 0),
       Timestamp(946684800, 0),
@@ -766,6 +765,10 @@ TEST_F(CastExprTest, timestampInvalid) {
 }
 
 TEST_F(CastExprTest, timestampAdjustToTimezone) {
+  // Empty timezone is assumed to be GMT.
+  testCast<std::string, Timestamp>(
+      "timestamp", {"1970-01-01"}, {Timestamp(0, 0)});
+
   setTimezone("America/Los_Angeles");
 
   // Expect unix epochs to be converted to LA timezone (8h offset).
@@ -789,21 +792,10 @@ TEST_F(CastExprTest, timestampAdjustToTimezone) {
       std::nullopt,
       Timestamp(957164400, 0),
   });
-
-  // Empty timezone is assumed to be GMT.
- setTimezone(""); - testCast( - "timestamp", {"1970-01-01"}, {Timestamp(0, 0)}); } TEST_F(CastExprTest, timestampAdjustToTimezoneInvalid) { - auto testFunc = [&]() { - testCast( - "timestamp", {"1970-01-01"}, {Timestamp(1, 0)}); - }; - - setTimezone("bla"); - EXPECT_THROW(testFunc(), std::runtime_error); + VELOX_ASSERT_USER_THROW(setTimezone("bla"), "Unknown time zone: 'bla'"); } TEST_F(CastExprTest, date) { diff --git a/velox/expression/tests/ExprStatsTest.cpp b/velox/expression/tests/ExprStatsTest.cpp index b892b8d575702..70394b4ce9176 100644 --- a/velox/expression/tests/ExprStatsTest.cpp +++ b/velox/expression/tests/ExprStatsTest.cpp @@ -158,10 +158,8 @@ struct Event { class TestListener : public exec::ExprSetListener { public: - explicit TestListener( - std::vector& events, - std::vector& exceptions) - : events_{events}, exceptions_{exceptions}, exceptionCount_{0} {} + explicit TestListener(std::vector& events) + : events_{events}, exceptionCount_{0} {} void onCompletion( const std::string& uuid, @@ -169,21 +167,8 @@ class TestListener : public exec::ExprSetListener { events_.push_back({uuid, event.stats, event.sqls}); } - void onError( - const SelectivityVector& rows, - const ::facebook::velox::ErrorVector& errors, - const std::string& /*queryId*/) override { - rows.applyToSelected([&](auto row) { - exceptionCount_++; - - try { - auto exception = - *std::static_pointer_cast(errors.valueAt(row)); - std::rethrow_exception(exception); - } catch (const std::exception& e) { - exceptions_.push_back(e.what()); - } - }); + void onError(vector_size_t numRows, const std::string& /*queryId*/) override { + exceptionCount_ += numRows; } int exceptionCount() const { @@ -193,12 +178,10 @@ class TestListener : public exec::ExprSetListener { void reset() { exceptionCount_ = 0; events_.clear(); - exceptions_.clear(); } private: std::vector& events_; - std::vector& exceptions_; int exceptionCount_; }; @@ -207,8 +190,7 @@ TEST_F(ExprStatsTest, listener) { // Register a listener to receive stats on ExprSet destruction. std::vector events; - std::vector exceptions; - auto listener = std::make_shared(events, exceptions); + auto listener = std::make_shared(events); ASSERT_TRUE(exec::registerExprSetListener(listener)); ASSERT_FALSE(exec::registerExprSetListener(listener)); @@ -308,8 +290,7 @@ TEST_F(ExprStatsTest, specialForms) { // Register a listener to receive stats on ExprSet destruction. std::vector events; - std::vector exceptions; - auto listener = std::make_shared(events, exceptions); + auto listener = std::make_shared(events); ASSERT_TRUE(exec::registerExprSetListener(listener)); auto data = makeRowVector({ @@ -367,8 +348,7 @@ TEST_F(ExprStatsTest, specialForms) { TEST_F(ExprStatsTest, errorLog) { // Register a listener to log exceptions. std::vector events; - std::vector exceptions; - auto listener = std::make_shared(events, exceptions); + auto listener = std::make_shared(events); ASSERT_TRUE(exec::registerExprSetListener(listener)); auto data = makeRowVector( @@ -386,14 +366,6 @@ TEST_F(ExprStatsTest, errorLog) { // Expect errors at rows 2 and 4. ASSERT_EQ(2, listener->exceptionCount()); - ASSERT_EQ(2, exceptions.size()); - for (const auto& exception : exceptions) { - ASSERT_TRUE( - exception.find("Context: cast((c0) as INTEGER)") != std::string::npos); - ASSERT_TRUE( - exception.find("Error Code: INVALID_ARGUMENT") != std::string::npos); - ASSERT_TRUE(exception.find("Stack trace:") != std::string::npos); - } // Test with multiple try expressions. Expect errors at rows 1, 2, 4, and 6. 
// The second row in c1 does not cause an additional error because the @@ -405,7 +377,6 @@ TEST_F(ExprStatsTest, errorLog) { evaluate(*exprSet, data); ASSERT_EQ(4, listener->exceptionCount()); - ASSERT_EQ(4, exceptions.size()); // Test with nested try expressions. Expect errors at rows 2, 3, 4, and 6. Row // 5 in c2 does not cause an error because the corresponding row in c0 is @@ -416,15 +387,6 @@ TEST_F(ExprStatsTest, errorLog) { evaluate(*exprSet, data); ASSERT_EQ(4, listener->exceptionCount()); - ASSERT_EQ(4, exceptions.size()); - ASSERT_TRUE( - exceptions[0].find("Error Code: INVALID_ARGUMENT") != std::string::npos); - ASSERT_TRUE( - exceptions[1].find("Error Code: INVALID_ARGUMENT") != std::string::npos); - ASSERT_TRUE( - exceptions[2].find("Error Code: ARITHMETIC_ERROR") != std::string::npos); - ASSERT_TRUE( - exceptions[3].find("Error Code: ARITHMETIC_ERROR") != std::string::npos); // Test with no error. listener->reset(); @@ -432,7 +394,6 @@ TEST_F(ExprStatsTest, errorLog) { evaluate(*exprSet, data); ASSERT_EQ(0, listener->exceptionCount()); - ASSERT_EQ(0, exceptions.size()); ASSERT_TRUE(exec::unregisterExprSetListener(listener)); } @@ -442,8 +403,7 @@ TEST_F(ExprStatsTest, complexConstants) { // '__complex_constant(c#)' pseudo functions. std::vector events; - std::vector exceptions; - auto listener = std::make_shared(events, exceptions); + auto listener = std::make_shared(events); ASSERT_TRUE(exec::registerExprSetListener(listener)); std::vector expressions = { @@ -456,7 +416,7 @@ TEST_F(ExprStatsTest, complexConstants) { } ASSERT_EQ(1, events.size()); - ASSERT_EQ(0, exceptions.size()); + ASSERT_EQ(0, listener->exceptionCount()); ASSERT_EQ(1, events[0].sqls.size()); ASSERT_EQ("__complex_constant(c0)", events[0].sqls[0]); diff --git a/velox/expression/tests/ExprTest.cpp b/velox/expression/tests/ExprTest.cpp index 52f6abc4d4bb1..79c28eb99dae6 100644 --- a/velox/expression/tests/ExprTest.cpp +++ b/velox/expression/tests/ExprTest.cpp @@ -267,7 +267,7 @@ class ExprTest : public testing::Test, public VectorTestBase { VELOX_CHECK(startPos != std::string::npos); startPos += strlen(key); auto endPos = context.find(".", startPos); - VELOX_CHECK(endPos != std::string::npos); + VELOX_CHECK(endPos != std::string::npos, context); return context.substr(startPos, endPos - startPos); } @@ -298,15 +298,16 @@ class ExprTest : public testing::Test, public VectorTestBase { } void verifyDataAndSqlPaths(const VeloxException& e, const VectorPtr& data) { - auto inputPath = extractInputPath(e.topLevelContext()); + auto inputPath = extractInputPath(e.additionalContext()); auto copy = restoreVector(inputPath); assertEqualVectors(data, copy); - auto sqlPath = extractSqlPath(e.topLevelContext()); + auto sqlPath = extractSqlPath(e.additionalContext()); auto sql = readSqlFromFile(sqlPath); ASSERT_NO_THROW(compileExpression(sql, asRowType(data->type()))); - auto allSqlsPath = extractAllExprSqlPath(e.topLevelContext()); + LOG(ERROR) << e.additionalContext(); + auto allSqlsPath = extractAllExprSqlPath(e.additionalContext()); auto allSqls = readSqlFromFile(allSqlsPath); ASSERT_NO_THROW(compileMultipleExprs(allSqls, asRowType(data->type()))); } @@ -334,20 +335,22 @@ class ExprTest : public testing::Test, public VectorTestBase { return sql; } - void assertError( + std::exception_ptr assertError( const std::string& expression, const VectorPtr& input, const std::string& context, - const std::string& topLevelContext, + const std::string& additionalContext, const std::string& message) { try { evaluate(expression, 
makeRowVector({input})); - ASSERT_TRUE(false) << "Expected an error"; + EXPECT_TRUE(false) << "Expected an error"; } catch (VeloxException& e) { - ASSERT_EQ(message, e.message()); - ASSERT_EQ(context, trimInputPath(e.context())); - ASSERT_EQ(topLevelContext, trimInputPath(e.topLevelContext())); + EXPECT_EQ(context, trimInputPath(e.context())); + EXPECT_EQ(additionalContext, trimInputPath(e.additionalContext())); + EXPECT_EQ(message, e.message()); + return e.wrappedException(); } + return nullptr; } void assertErrorSimplified( @@ -369,14 +372,14 @@ class ExprTest : public testing::Test, public VectorTestBase { const std::string& expression, const VectorPtr& input, const std::string& context, - const std::string& topLevelContext, + const std::string& additionalContext, const std::string& message) { try { evaluate(expression, makeRowVector({input})); EXPECT_TRUE(false) << "Expected an error"; } catch (VeloxException& e) { EXPECT_EQ(context, trimInputPath(e.context())); - EXPECT_EQ(topLevelContext, trimInputPath(e.topLevelContext())); + EXPECT_EQ(additionalContext, trimInputPath(e.additionalContext())); EXPECT_EQ(message, e.message()); return e.wrappedException(); } @@ -2410,7 +2413,9 @@ TEST_P(ParameterizedExprTest, exceptionContext) { FAIL() << "Expected an exception"; } catch (const VeloxException& e) { ASSERT_EQ("always_throws(c0)", e.context()); - ASSERT_EQ("plus(always_throws(c0), c1)", e.topLevelContext()); + ASSERT_EQ( + "Top-level Expression: plus(always_throws(c0), c1)", + e.additionalContext()); } try { @@ -2419,8 +2424,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", - e.topLevelContext()); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", + e.additionalContext()); } try { @@ -2429,8 +2434,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((c1) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", - e.topLevelContext()); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", + e.additionalContext()); } // Enable saving vector and expression SQL for system errors only. 
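 // Each failing evaluation below is expected to save its input vector and
 // expression SQL; verifyDataAndSqlPaths() reloads both and checks that they
 // round-trip.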
@@ -2444,7 +2449,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("runtime_error(c0)", e.context()); ASSERT_EQ( - "plus(runtime_error(c0), c1)", trimInputPath(e.topLevelContext())); + "Top-level Expression: plus(runtime_error(c0), c1)", + trimInputPath(e.additionalContext())); verifyDataAndSqlPaths(e, data); } @@ -2454,8 +2460,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", - e.topLevelContext()) + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", + e.additionalContext()) << e.errorSource(); } @@ -2465,8 +2471,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", - e.topLevelContext()); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", + e.additionalContext()); } // Enable saving vector and expression SQL for all errors. @@ -2480,7 +2486,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("always_throws(c0)", e.context()); ASSERT_EQ( - "plus(always_throws(c0), c1)", trimInputPath(e.topLevelContext())); + "Top-level Expression: plus(always_throws(c0), c1)", + trimInputPath(e.additionalContext())); verifyDataAndSqlPaths(e, data); } @@ -2490,8 +2497,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", - trimInputPath(e.topLevelContext())); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", + trimInputPath(e.additionalContext())); verifyDataAndSqlPaths(e, data); } @@ -2501,8 +2508,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((c1) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", - trimInputPath(e.topLevelContext())); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", + trimInputPath(e.additionalContext())); verifyDataAndSqlPaths(e, data); } @@ -2512,8 +2519,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((c1) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", - trimInputPath(e.topLevelContext())); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", + trimInputPath(e.additionalContext())); verifyDataAndSqlPaths(e, data); } } @@ -2536,19 +2543,19 @@ TEST_P(ParameterizedExprTest, stdExceptionContext) { registerFunction( {"throw_invalid_argument"}); - auto wrappedEx = assertWrappedException( + auto wrappedEx = assertError( "throw_invalid_argument(c0) + 5", data, "throw_invalid_argument(c0)", - "plus(throw_invalid_argument(c0), 5:BIGINT)", + "Top-level Expression: plus(throw_invalid_argument(c0), 5:BIGINT)", "This is a test"); ASSERT_THROW(std::rethrow_exception(wrappedEx), std::invalid_argument); - wrappedEx = 
assertWrappedException( + wrappedEx = assertError( "throw_invalid_argument(c0 + 5)", data, - "throw_invalid_argument(plus(c0, 5:BIGINT))", - "Same as context.", + "Top-level Expression: throw_invalid_argument(plus(c0, 5:BIGINT))", + "", "This is a test"); ASSERT_THROW(std::rethrow_exception(wrappedEx), std::invalid_argument); } @@ -2947,15 +2954,15 @@ TEST_P(ParameterizedExprTest, castExceptionContext) { assertError( "cast(c0 as bigint)", makeFlatVector({"1a"}), - "cast((c0) as BIGINT)", - "Same as context.", + "Top-level Expression: cast((c0) as BIGINT)", + "", "Cannot cast VARCHAR '1a' to BIGINT. Non-whitespace character found after end of conversion: \"a\""); assertError( "cast(c0 as timestamp)", makeFlatVector(std::vector{1}), - "cast((c0) as TIMESTAMP)", - "Same as context.", + "Top-level Expression: cast((c0) as TIMESTAMP)", + "", "Cannot cast TINYINT '1' to TIMESTAMP. Conversion to Timestamp is not supported"); } @@ -2964,7 +2971,7 @@ TEST_P(ParameterizedExprTest, switchExceptionContext) { "case c0 when 7 then c0 / 0 else 0 end", makeFlatVector(std::vector{7}), "divide(c0, 0:BIGINT)", - "switch(eq(c0, 7:BIGINT), divide(c0, 0:BIGINT), 0:BIGINT)", + "Top-level Expression: switch(eq(c0, 7:BIGINT), divide(c0, 0:BIGINT), 0:BIGINT)", "division by zero"); } @@ -2975,7 +2982,7 @@ TEST_P(ParameterizedExprTest, conjunctExceptionContext) { "if (c0 % 409 < 300 and c0 / 0 < 30, 1, 2)", data, "divide(c0, 0:BIGINT)", - "switch(and(lt(mod(c0, 409:BIGINT), 300:BIGINT), lt(divide(c0, 0:BIGINT), 30:BIGINT)), 1:BIGINT, 2:BIGINT)", + "Top-level Expression: switch(and(lt(mod(c0, 409:BIGINT), 300:BIGINT), lt(divide(c0, 0:BIGINT), 30:BIGINT)), 1:BIGINT, 2:BIGINT)", "division by zero"); } @@ -2987,7 +2994,7 @@ TEST_P(ParameterizedExprTest, lambdaExceptionContext) { "filter(c0, x -> (x / 0 > 1))", array, "divide(x, 0:BIGINT)", - "filter(c0, (x) -> gt(divide(x, 0:BIGINT), 1:BIGINT))", + "Top-level Expression: filter(c0, (x) -> gt(divide(x, 0:BIGINT), 1:BIGINT))", "division by zero"); } @@ -3529,7 +3536,7 @@ TEST_P(ParameterizedExprTest, applyFunctionNoResult) { "always_throws_vector_function(c0) AND true", makeFlatVector({1, 2, 3}), "always_throws_vector_function(c0)", - "and(always_throws_vector_function(c0), true:BOOLEAN)", + "Top-level Expression: and(always_throws_vector_function(c0), true:BOOLEAN)", TestingAlwaysThrowsVectorFunction::kVeloxErrorMessage); exec::registerVectorFunction( @@ -3541,7 +3548,7 @@ TEST_P(ParameterizedExprTest, applyFunctionNoResult) { "no_op(c0) AND true", makeFlatVector({1, 2, 3}), "no_op(c0)", - "and(no_op(c0), true:BOOLEAN)", + "Top-level Expression: and(no_op(c0), true:BOOLEAN)", "Function neither returned results nor threw exception."); } @@ -3671,8 +3678,8 @@ TEST_P(ParameterizedExprTest, stdExceptionInVectorFunction) { assertError( "always_throws_vector_function(c0)", makeFlatVector({1, 2, 3}), - "always_throws_vector_function(c0)", - "Same as context.", + "Top-level Expression: always_throws_vector_function(c0)", + "", TestingAlwaysThrowsVectorFunction::kStdErrorMessage); assertErrorSimplified( diff --git a/velox/expression/tests/ExpressionRunnerUnitTest.cpp b/velox/expression/tests/ExpressionRunnerUnitTest.cpp index a6107543885be..2c56715b197f3 100644 --- a/velox/expression/tests/ExpressionRunnerUnitTest.cpp +++ b/velox/expression/tests/ExpressionRunnerUnitTest.cpp @@ -15,12 +15,12 @@ */ #include -#include "FuzzerRunner.h" #include "velox/dwio/common/tests/utils/BatchMaker.h" #include "velox/exec/tests/utils/TempFilePath.h" #include "velox/expression/Expr.h" #include 
"velox/expression/SignatureBinder.h" -#include "velox/expression/tests/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/FuzzerRunner.h" #include "velox/expression/tests/ExpressionRunner.h" #include "velox/functions/prestosql/registration/RegistrationFunctions.h" #include "velox/vector/VectorSaver.h" diff --git a/velox/expression/tests/ExpressionVerifier.cpp b/velox/expression/tests/ExpressionVerifier.cpp index 9fd070a8657d6..1d3b74cf82fb1 100644 --- a/velox/expression/tests/ExpressionVerifier.cpp +++ b/velox/expression/tests/ExpressionVerifier.cpp @@ -41,7 +41,7 @@ void logRowVector(const RowVectorPtr& rowVector) { } } // namespace -ResultOrError ExpressionVerifier::verify( +fuzzer::ResultOrError ExpressionVerifier::verify( const std::vector& plans, const RowVectorPtr& rowVector, VectorPtr&& resultVector, @@ -127,7 +127,7 @@ ResultOrError ExpressionVerifier::verify( if (copiedInput) { // Flatten the input vector as an optimization if its very deeply nested. - compareVectors( + fuzzer::compareVectors( copiedInput, BaseVector::copy(*inputRowVector), "Copy of original input", @@ -162,7 +162,7 @@ ResultOrError ExpressionVerifier::verify( exprSetSimplified.eval(rows, evalCtxSimplified, simplifiedEvalResult); // Flatten the input vector as an optimization if its very deeply nested. - compareVectors( + fuzzer::compareVectors( copy, BaseVector::copy(*rowVector), "Copy of original input", @@ -183,14 +183,14 @@ ResultOrError ExpressionVerifier::verify( if (exceptionCommonPtr || exceptionSimplifiedPtr) { // Throws in case exceptions are not compatible. If they are compatible, // return false to signal that the expression failed. - compareExceptions(exceptionCommonPtr, exceptionSimplifiedPtr); + fuzzer::compareExceptions(exceptionCommonPtr, exceptionSimplifiedPtr); return {nullptr, exceptionCommonPtr}; } else { // Throws in case output is different. VELOX_CHECK_EQ(commonEvalResult.size(), plans.size()); VELOX_CHECK_EQ(simplifiedEvalResult.size(), plans.size()); for (int i = 0; i < plans.size(); ++i) { - compareVectors( + fuzzer::compareVectors( commonEvalResult[i], simplifiedEvalResult[i], "common path results ", @@ -431,7 +431,7 @@ class MinimalSubExpressionFinder { results ? BaseVector::copy(*results) : nullptr, true, // canThrow columnsToWrapInLazy); - } catch (const std::exception& e) { + } catch (const std::exception&) { success = false; } FLAGS_minloglevel = 0; diff --git a/velox/expression/tests/ExpressionVerifier.h b/velox/expression/tests/ExpressionVerifier.h index be7cb52e680cb..240768b2aa68f 100644 --- a/velox/expression/tests/ExpressionVerifier.h +++ b/velox/expression/tests/ExpressionVerifier.h @@ -18,7 +18,7 @@ #include "velox/core/ITypedExpr.h" #include "velox/core/QueryCtx.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/functions/FunctionRegistry.h" #include "velox/type/Type.h" #include "velox/vector/BaseVector.h" @@ -63,7 +63,7 @@ class ExpressionVerifier { // - exception thrown by the common path if both paths failed with compatible // exceptions. // - throws otherwise (incompatible exceptions or different results). 
-  ResultOrError verify(
+  fuzzer::ResultOrError verify(
       const std::vector<core::TypedExprPtr>& plans,
       const RowVectorPtr& rowVector,
       VectorPtr&& resultVector,
diff --git a/velox/expression/tests/SimpleFunctionTest.cpp b/velox/expression/tests/SimpleFunctionTest.cpp
index 63143e7f06a81..cd76de6bf3a67 100644
--- a/velox/expression/tests/SimpleFunctionTest.cpp
+++ b/velox/expression/tests/SimpleFunctionTest.cpp
@@ -19,8 +19,10 @@
 #include
 #include
+#include
 #include "folly/lang/Hint.h"
-#include "gtest/gtest.h"
+
+#include "velox/common/base/tests/GTestUtils.h"
 #include "velox/expression/Expr.h"
 #include "velox/expression/SimpleFunctionAdapter.h"
 #include "velox/functions/Udf.h"
@@ -1135,7 +1137,7 @@ struct StringInputIntOutputFunction {
   }
 };
 
-TEST_F(SimpleFunctionTest, TestcallAscii) {
+TEST_F(SimpleFunctionTest, callAscii) {
   registerFunction<StringInputIntOutputFunction, int32_t, Varchar>(
       {"get_input_size"});
   auto asciiInput = makeFlatVector({"abc123", "10% #\0"});
@@ -1478,4 +1480,57 @@ TEST_F(SimpleFunctionTest, decimalsWithConstraints) {
   }
 }
 
+template <typename TExec>
+struct NoThrowFunction {
+  VELOX_DEFINE_FUNCTION_TYPES(TExec);
+
+  Status call(out_type<int64_t>& out, const arg_type<int64_t>& in) {
+    if (in % 3 != 0) {
+      return Status::UserError("Input must be divisible by 3");
+    }
+
+    // Throwing exceptions is not recommended, but allowed.
+    VELOX_USER_CHECK(in % 2 == 0, "Input must be even");
+
+    if (in == 6) {
+      return Status::UnknownError("Input must not be 6");
+    }
+
+    out = in / 6;
+    return Status::OK();
+  }
+};
+
+TEST_F(SimpleFunctionTest, noThrow) {
+  registerFunction<NoThrowFunction, int64_t, int64_t>({"no_throw"});
+
+  auto result = evaluateOnce<int64_t, int64_t>("no_throw(c0)", 12);
+  EXPECT_EQ(2, result);
+
+  // Errors reported via Status.
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<int64_t, int64_t>("no_throw(c0)", 10)),
+      "Input must be divisible by 3");
+
+  result = evaluateOnce<int64_t, int64_t>("try(no_throw(c0))", 10);
+  EXPECT_EQ(std::nullopt, result);
+
+  // Errors reported by throwing exceptions.
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<int64_t, int64_t>("no_throw(c0)", 15)),
+      "Input must be even");
+
+  result = evaluateOnce<int64_t, int64_t>("try(no_throw(c0))", 15);
+  EXPECT_EQ(std::nullopt, result);
+
+  // Non-user errors cannot be suppressed by TRY.
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<int64_t, int64_t>("no_throw(c0)", 6)),
+      "Input must not be 6");
+
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<int64_t, int64_t>("try(no_throw(c0))", 6)),
+      "Input must not be 6");
+}
+
 } // namespace
diff --git a/velox/flag_definitions/flags.cpp b/velox/flag_definitions/flags.cpp
index d25ae44713f1b..b26eeb5d4c704 100644
--- a/velox/flag_definitions/flags.cpp
+++ b/velox/flag_definitions/flags.cpp
@@ -30,7 +30,7 @@ DEFINE_int32(
 
 DEFINE_bool(
     velox_time_allocations,
-    true,
+    false,
     "Record time and volume for large allocation/free");
 
 // Used in common/base/VeloxException.cpp
diff --git a/velox/functions/lib/Re2Functions.cpp b/velox/functions/lib/Re2Functions.cpp
index 451640ef20252..c8fdfd80f160e 100644
--- a/velox/functions/lib/Re2Functions.cpp
+++ b/velox/functions/lib/Re2Functions.cpp
@@ -217,7 +217,7 @@ class Re2MatchConstantPattern final : public exec::VectorFunction {
     exec::LocalDecodedVector toSearch(context, *args[0], rows);
     try {
       checkForBadPattern(re_);
-    } catch (const std::exception& e) {
+    } catch (const std::exception&) {
       context.setErrors(rows, std::current_exception());
       return;
     }
@@ -288,7 +288,7 @@ class Re2SearchAndExtractConstantPattern final : public exec::VectorFunction {
     // apply() will not be invoked if the selection is empty.
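     // A malformed constant pattern fails all selected rows at once via
     // context.setErrors() instead of throwing out of apply().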
try { checkForBadPattern(re_); - } catch (const std::exception& e) { + } catch (const std::exception&) { context.setErrors(rows, std::current_exception()); return; } @@ -312,7 +312,7 @@ class Re2SearchAndExtractConstantPattern final : public exec::VectorFunction { if (const auto groupId = getIfConstant(*args[2])) { try { checkForBadGroupId(*groupId, re_); - } catch (const std::exception& e) { + } catch (const std::exception&) { context.setErrors(rows, std::current_exception()); return; } @@ -825,7 +825,7 @@ class LikeWithRe2 final : public exec::VectorFunction { // apply() will not be invoked if the selection is empty. try { checkForBadPattern(*re_); - } catch (const std::exception& e) { + } catch (const std::exception&) { context.setErrors(rows, std::current_exception()); return; } @@ -1058,7 +1058,7 @@ class Re2ExtractAllConstantPattern final : public exec::VectorFunction { VELOX_CHECK(args.size() == 2 || args.size() == 3); try { checkForBadPattern(re_); - } catch (const std::exception& e) { + } catch (const std::exception&) { context.setErrors(rows, std::current_exception()); return; } @@ -1083,7 +1083,7 @@ class Re2ExtractAllConstantPattern final : public exec::VectorFunction { // try { checkForBadGroupId(*_groupId, re_); - } catch (const std::exception& e) { + } catch (const std::exception&) { context.setErrors(rows, std::current_exception()); return; } diff --git a/velox/functions/lib/RegistrationHelpers.h b/velox/functions/lib/RegistrationHelpers.h index e411057d8cd29..cea30c50a2a1e 100644 --- a/velox/functions/lib/RegistrationHelpers.h +++ b/velox/functions/lib/RegistrationHelpers.h @@ -73,6 +73,14 @@ void registerUnaryIntegral(const std::vector& aliases) { registerFunction(aliases); } +template