Merge branch 'branch-25.02' into prevent-pylibcudf-serialization

rapidsai · Nov 29, 2024 · 2ba7ed1 · 2ba7ed1
2 parents 3e14ec9 + b084d74
commit 2ba7ed1
Show file tree

Hide file tree

Showing 134 changed files with 3,702 additions and 3,671 deletions.
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
@@ -2,3 +2,4 @@
 # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
 
 enabled: true
+auto_sync_draft: false
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -0,0 +1,26 @@
+name: Trigger Breaking Change Notifications
+
+on:
+  pull_request_target:
+    types:
+      - closed
+      - reopened
+      - labeled
+      - unlabeled
+
+jobs:
+  trigger-notifier:
+    if: contains(github.event.pull_request.labels.*.name, 'breaking')
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02
+    with:
+      sender_login: ${{ github.event.sender.login }}
+      sender_avatar: ${{ github.event.sender.avatar_url }}
+      repo: ${{ github.repository }}
+      pr_number: ${{ github.event.pull_request.number }}
+      pr_title: "${{ github.event.pull_request.title }}"
+      pr_body: "${{ github.event.pull_request.body || '_Empty PR description_' }}"
+      pr_base_ref: ${{ github.event.pull_request.base.ref }}
+      pr_author: ${{ github.event.pull_request.user.login }}
+      event_action: ${{ github.event.action }}
+      pr_merged: ${{ github.event.pull_request.merged }}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -926,9 +926,16 @@ add_dependencies(cudf jitify_preprocess_run)
 # Specify the target module library dependencies
 target_link_libraries(
   cudf
-  PUBLIC CCCL::CCCL rmm::rmm $<BUILD_LOCAL_INTERFACE:BS::thread_pool> spdlog::spdlog_header_only
-  PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp> cuco::cuco ZLIB::ZLIB nvcomp::nvcomp
-          kvikio::kvikio $<TARGET_NAME_IF_EXISTS:CUDA::cuFile${_cufile_suffix}> nanoarrow
+  PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $<BUILD_LOCAL_INTERFACE:BS::thread_pool>
+         spdlog::spdlog_header_only
+  PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp>
+          cuco::cuco
+          ZLIB::ZLIB
+          nvcomp::nvcomp
+          kvikio::kvikio
+          $<TARGET_NAME_IF_EXISTS:CUDA::cuFile${_cufile_suffix}>
+          nanoarrow
+          rmm::rmm_logger_impl
 )
 
 # Add Conda library, and include paths if specified
@@ -1007,7 +1014,7 @@ if(CUDF_BUILD_TESTUTIL)
   )
 
   target_link_libraries(
-    cudftestutil INTERFACE Threads::Threads cudf cudftest_default_stream
+    cudftestutil INTERFACE cuco::cuco Threads::Threads cudf cudftest_default_stream
                            $<TARGET_NAME_IF_EXISTS:conda_env>
   )
 

diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -16,11 +16,7 @@
 function(find_and_configure_cucollections)
   include(${rapids-cmake-dir}/cpm/cuco.cmake)
 
-  if(BUILD_SHARED_LIBS)
-    rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports)
-  else()
-    rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports)
-  endif()
+  rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports)
 endfunction()
 
 find_and_configure_cucollections()
diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt
@@ -16,6 +16,9 @@ project(
 
 include(../fetch_dependencies.cmake)
 
+include(rapids-cmake)
+rapids_cmake_build_type("Release")
+
 # Configure your project here
 add_executable(basic_example src/process_csv.cpp)
 target_link_libraries(basic_example PRIVATE cudf::cudf)

diff --git a/cpp/examples/billion_rows/CMakeLists.txt b/cpp/examples/billion_rows/CMakeLists.txt
@@ -16,6 +16,9 @@ project(
 
 include(../fetch_dependencies.cmake)
 
+include(rapids-cmake)
+rapids_cmake_build_type("Release")
+
 list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)
 
 add_library(groupby_results OBJECT groupby_results.cpp)

diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt
@@ -15,6 +15,9 @@ project(
 
 include(../fetch_dependencies.cmake)
 
+include(rapids-cmake)
+rapids_cmake_build_type("Release")
+
 # The Arrow CMake is currently broken if the build type is not set
 set(CMAKE_BUILD_TYPE Release)
 # No need to install Arrow libs when only the final example executable is shipped.

diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt
@@ -16,6 +16,9 @@ project(
 
 include(../fetch_dependencies.cmake)
 
+include(rapids-cmake)
+rapids_cmake_build_type("Release")
+
 # Configure your project here
 add_executable(deduplication deduplication.cpp)
 target_link_libraries(deduplication PRIVATE cudf::cudf)

diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt
@@ -16,6 +16,9 @@ project(
 
 include(../fetch_dependencies.cmake)
 
+include(rapids-cmake)
+rapids_cmake_build_type("Release")
+
 add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp)
 target_compile_features(parquet_io_utils PRIVATE cxx_std_17)
 target_link_libraries(parquet_io_utils PRIVATE cudf::cudf)

diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt
@@ -16,6 +16,9 @@ project(
 
 include(../fetch_dependencies.cmake)
 
+include(rapids-cmake)
+rapids_cmake_build_type("Release")
+
 list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)
 
 add_executable(libcudf_apis libcudf_apis.cpp)

diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -25,8 +24,7 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <cub/cub.cuh>
-
-#include <type_traits>
+#include <cuda/std/type_traits>
 
 namespace cudf {
 namespace detail {
@@ -164,7 +162,7 @@ template <int32_t block_size, int32_t leader_lane = 0, typename T>
 __device__ T single_lane_block_sum_reduce(T lane_value)
 {
   static_assert(block_size <= 1024, "Invalid block size.");
-  static_assert(std::is_arithmetic_v<T>, "Invalid non-arithmetic type.");
+  static_assert(cuda::std::is_arithmetic_v<T>, "Invalid non-arithmetic type.");
   constexpr auto warps_per_block{block_size / warp_size};
   auto const lane_id{threadIdx.x % warp_size};
   auto const warp_id{threadIdx.x / warp_size};

diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2017-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,157 +24,83 @@
 #include <cudf/structs/struct_view.hpp>
 #include <cudf/types.hpp>
 
-#include <cstddef>
+#include <cuco/hash_functions.cuh>
+#include <cuda/std/cstddef>
 
 namespace cudf::hashing::detail {
 
-// MurmurHash3_x86_32 implementation from
-// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-// Note - The x86 and x64 versions do _not_ produce the same results, as the
-// algorithms are optimized for their respective platforms. You can still
-// compile and run any of them on any platform, but your performance with the
-// non-native version will be less than optimal.
 template <typename Key>
 struct MurmurHash3_x86_32 {
   using result_type = hash_value_type;
 
-  constexpr MurmurHash3_x86_32() = default;
-  constexpr MurmurHash3_x86_32(uint32_t seed) : m_seed(seed) {}
-
-  [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const
+  __host__ __device__ constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED)
+    : _impl{seed}
   {
-    h ^= h >> 16;
-    h *= 0x85ebca6b;
-    h ^= h >> 13;
-    h *= 0xc2b2ae35;
-    h ^= h >> 16;
-    return h;
   }
 
-  [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data,
-                                                      cudf::size_type offset) const
-  {
-    // Read a 4-byte value from the data pointer as individual bytes for safe
-    // unaligned access (very likely for string types).
-    auto const block = reinterpret_cast<uint8_t const*>(data + offset);
-    return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
-  }
+  __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }
 
-  [[nodiscard]] result_type __device__ inline operator()(Key const& key) const
+  __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
+                                                 std::uint64_t size) const
   {
-    return compute(normalize_nans_and_zeros(key));
+    return this->_impl.compute_hash(bytes, size);
   }
 
+ private:
   template <typename T>
-  result_type __device__ inline compute(T const& key) const
+  __device__ constexpr result_type compute(T const& key) const
   {
-    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
+    return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
   }
 
-  result_type __device__ inline compute_remaining_bytes(std::byte const* data,
-                                                        cudf::size_type len,
-                                                        cudf::size_type tail_offset,
-                                                        result_type h) const
-  {
-    // Process remaining bytes that do not fill a four-byte chunk.
-    uint32_t k1 = 0;
-    switch (len % 4) {
-      case 3: k1 ^= std::to_integer<uint8_t>(data[tail_offset + 2]) << 16; [[fallthrough]];
-      case 2: k1 ^= std::to_integer<uint8_t>(data[tail_offset + 1]) << 8; [[fallthrough]];
-      case 1:
-        k1 ^= std::to_integer<uint8_t>(data[tail_offset]);
-        k1 *= c1;
-        k1 = rotate_bits_left(k1, rot_c1);
-        k1 *= c2;
-        h ^= k1;
-    };
-    return h;
-  }
-
-  result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
-  {
-    constexpr cudf::size_type BLOCK_SIZE = 4;
-    cudf::size_type const nblocks        = len / BLOCK_SIZE;
-    cudf::size_type const tail_offset    = nblocks * BLOCK_SIZE;
-    result_type h                        = m_seed;
-
-    // Process all four-byte chunks.
-    for (cudf::size_type i = 0; i < nblocks; i++) {
-      uint32_t k1 = getblock32(data, i * BLOCK_SIZE);
-      k1 *= c1;
-      k1 = rotate_bits_left(k1, rot_c1);
-      k1 *= c2;
-      h ^= k1;
-      h = rotate_bits_left(h, rot_c2);
-      h = h * 5 + c3;
-    }
-
-    h = compute_remaining_bytes(data, len, tail_offset, h);
-
-    // Finalize hash.
-    h ^= len;
-    h = fmix32(h);
-    return h;
-  }
-
- private:
-  uint32_t m_seed{cudf::DEFAULT_HASH_SEED};
-  static constexpr uint32_t c1     = 0xcc9e2d51;
-  static constexpr uint32_t c2     = 0x1b873593;
-  static constexpr uint32_t c3     = 0xe6546b64;
-  static constexpr uint32_t rot_c1 = 15;
-  static constexpr uint32_t rot_c2 = 13;
+  cuco::murmurhash3_32<Key> _impl;
 };
 
 template <>
 hash_value_type __device__ inline MurmurHash3_x86_32<bool>::operator()(bool const& key) const
 {
-  return compute(static_cast<uint8_t>(key));
+  return this->compute(static_cast<uint8_t>(key));
 }
 
 template <>
 hash_value_type __device__ inline MurmurHash3_x86_32<float>::operator()(float const& key) const
 {
-  return compute(normalize_nans_and_zeros(key));
+  return this->compute(normalize_nans_and_zeros(key));
 }
 
 template <>
 hash_value_type __device__ inline MurmurHash3_x86_32<double>::operator()(double const& key) const
 {
-  return compute(normalize_nans_and_zeros(key));
+  return this->compute(normalize_nans_and_zeros(key));
 }
 
 template <>
 hash_value_type __device__ inline MurmurHash3_x86_32<cudf::string_view>::operator()(
   cudf::string_view const& key) const
 {
-  auto const data = reinterpret_cast<std::byte const*>(key.data());
-  auto const len  = key.size_bytes();
-  return compute_bytes(data, len);
+  return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
+                             key.size_bytes());
 }
 
 template <>
 hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal32>::operator()(
   numeric::decimal32 const& key) const
 {
-  return compute(key.value());
+  return this->compute(key.value());
 }
 
 template <>
 hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal64>::operator()(
   numeric::decimal64 const& key) const
 {
-  return compute(key.value());
+  return this->compute(key.value());
 }
 
 template <>
 hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal128>::operator()(
   numeric::decimal128 const& key) const
 {
-  return compute(key.value());
+  return this->compute(key.value());
 }
 
 template <>