From 7575e8da54499990b51535a0f975acd02a493144 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 12 May 2023 15:37:56 -0700 Subject: [PATCH] Fix `contiguous_split` performance (#13342) This fixes a performance issue in `contiguous_split` that is due to `pack_metadata` not being implemented in an efficient way. In particular, the output bytes are copied from the internal buffer to the output buffer byte-by-byte, through `std::back_inserter`: ``` std::copy(metadata_begin, metadata_begin + (metadata.size() * sizeof(detail::serialized_column)), std::back_inserter(metadata_bytes)); ``` This was probably optimized somehow by the compiler, but recent refactors changed the code in a way that probably prevents such optimization. ### Benchmark Latest cudf commit: ``` ---------------------------------------------------------------------------------------------------------------------------------------------------- ContiguousSplit/6Gb512ColsNoValidity/6442450944/512/256/0/iterations:8/manual_time 46.1 ms 46.1 ms 8 bytes_per_second=260.086G/s ContiguousSplit/6Gb512ColsValidity/6442450944/512/256/1/iterations:8/manual_time 48.1 ms 48.0 ms 8 bytes_per_second=257.527G/s ContiguousSplit/6Gb10ColsNoValidity/6442450944/10/256/0/iterations:8/manual_time 27.4 ms 27.4 ms 8 bytes_per_second=438.188G/s ContiguousSplit/6Gb10ColsValidity/6442450944/10/256/1/iterations:8/manual_time 28.5 ms 28.5 ms 8 bytes_per_second=434.381G/s ContiguousSplit/4Gb512ColsNoValidity/4294967296/512/256/0/iterations:8/manual_time 34.5 ms 34.5 ms 8 bytes_per_second=231.825G/s ContiguousSplit/4Gb512ColsValidity/4294967296/512/256/1/iterations:8/manual_time 37.4 ms 37.4 ms 8 bytes_per_second=220.521G/s ContiguousSplit/4Gb10ColsNoValidity/4294967296/10/256/0/iterations:8/manual_time 18.9 ms 18.9 ms 8 bytes_per_second=422.259G/s ContiguousSplit/4Gb10ColsValidity/4294967296/10/256/1/iterations:8/manual_time 19.4 ms 19.4 ms 8 bytes_per_second=424.595G/s 
ContiguousSplit/4Gb4ColsNoSplits/1073741824/4/0/1/iterations:8/manual_time 4.35 ms 4.35 ms 8 bytes_per_second=474.47G/s ContiguousSplit/4Gb4ColsValidityNoSplits/1073741824/4/0/1/iterations:8/manual_time 4.35 ms 4.36 ms 8 bytes_per_second=473.665G/s ContiguousSplit/1Gb512ColsNoValidity/1073741824/512/256/0/iterations:8/manual_time 22.2 ms 22.2 ms 8 bytes_per_second=90.1502G/s ContiguousSplit/1Gb512ColsValidity/1073741824/512/256/1/iterations:8/manual_time 25.1 ms 25.1 ms 8 bytes_per_second=82.1379G/s ContiguousSplit/1Gb10ColsNoValidity/1073741824/10/256/0/iterations:8/manual_time 5.08 ms 5.08 ms 8 bytes_per_second=393.98G/s ContiguousSplit/1Gb10ColsValidity/1073741824/10/256/1/iterations:8/manual_time 5.28 ms 5.28 ms 8 bytes_per_second=390.85G/s ContiguousSplit/1Gb1ColNoSplits/1073741824/1/0/1/iterations:8/manual_time 4.34 ms 4.35 ms 8 bytes_per_second=474.715G/s ContiguousSplit/1Gb1ColValidityNoSplits/1073741824/1/0/1/iterations:8/manual_time 4.47 ms 4.47 ms 8 bytes_per_second=461.788G/s ContiguousSplitStrings/4Gb512ColsNoValidity/4294967296/512/256/0/iterations:8/manual_time 98.1 ms 98.0 ms 8 bytes_per_second=81.6345G/s ContiguousSplitStrings/4Gb512ColsValidity/4294967296/512/256/1/iterations:8/manual_time 89.5 ms 89.5 ms 8 bytes_per_second=90.843G/s ContiguousSplitStrings/4Gb10ColsNoValidity/4294967296/10/256/0/iterations:8/manual_time 28.9 ms 29.9 ms 8 bytes_per_second=290.261G/s ContiguousSplitStrings/4Gb10ColsValidity/4294967296/10/256/1/iterations:8/manual_time 20.4 ms 20.4 ms 8 bytes_per_second=417.033G/s ContiguousSplitStrings/4Gb4ColsNoSplits/1073741824/4/0/0/iterations:8/manual_time 6.70 ms 7.32 ms 8 bytes_per_second=335.9G/s ContiguousSplitStrings/4Gb4ColsValidityNoSplits/1073741824/4/0/1/iterations:8/manual_time 4.35 ms 4.36 ms 8 bytes_per_second=524.386G/s ContiguousSplitStrings/1Gb512ColsNoValidity/1073741824/512/256/0/iterations:8/manual_time 77.8 ms 77.8 ms 8 bytes_per_second=25.7184G/s 
ContiguousSplitStrings/1Gb512ColsValidity/1073741824/512/256/1/iterations:8/manual_time 79.2 ms 79.1 ms 8 bytes_per_second=25.6833G/s ContiguousSplitStrings/1Gb10ColsNoValidity/1073741824/10/256/0/iterations:8/manual_time 8.57 ms 8.81 ms 8 bytes_per_second=245.062G/s ContiguousSplitStrings/1Gb10ColsValidity/1073741824/10/256/1/iterations:8/manual_time 7.83 ms 6.15 ms 8 bytes_per_second=272.089G/s ContiguousSplitStrings/1Gb1ColNoSplits/1073741824/1/0/0/iterations:8/manual_time 6.66 ms 9.17 ms 8 bytes_per_second=450.551G/s ContiguousSplitStrings/1Gb1ColValidityNoSplits/1073741824/1/0/1/iterations:8/manual_time 4.41 ms 4.41 ms 8 bytes_per_second=687.88G/s ``` With this fix: ``` ---------------------------------------------------------------------------------------------------------------------------------------------------- ContiguousSplit/6Gb512ColsNoValidity/6442450944/512/256/0/iterations:8/manual_time 38.5 ms 38.4 ms 8 bytes_per_second=311.981G/s ContiguousSplit/6Gb512ColsValidity/6442450944/512/256/1/iterations:8/manual_time 42.8 ms 42.7 ms 8 bytes_per_second=289.289G/s ContiguousSplit/6Gb10ColsNoValidity/6442450944/10/256/0/iterations:8/manual_time 27.6 ms 27.5 ms 8 bytes_per_second=435.365G/s ContiguousSplit/6Gb10ColsValidity/6442450944/10/256/1/iterations:8/manual_time 28.4 ms 28.3 ms 8 bytes_per_second=436.145G/s ContiguousSplit/4Gb512ColsNoValidity/4294967296/512/256/0/iterations:8/manual_time 27.2 ms 27.2 ms 8 bytes_per_second=293.677G/s ContiguousSplit/4Gb512ColsValidity/4294967296/512/256/1/iterations:8/manual_time 29.9 ms 29.9 ms 8 bytes_per_second=276.137G/s ContiguousSplit/4Gb10ColsNoValidity/4294967296/10/256/0/iterations:8/manual_time 19.0 ms 19.0 ms 8 bytes_per_second=421.185G/s ContiguousSplit/4Gb10ColsValidity/4294967296/10/256/1/iterations:8/manual_time 19.1 ms 19.1 ms 8 bytes_per_second=431.306G/s ContiguousSplit/4Gb4ColsNoSplits/1073741824/4/0/1/iterations:8/manual_time 4.35 ms 4.35 ms 8 bytes_per_second=474.311G/s 
ContiguousSplit/4Gb4ColsValidityNoSplits/1073741824/4/0/1/iterations:8/manual_time 4.34 ms 4.35 ms 8 bytes_per_second=475.281G/s ContiguousSplit/1Gb512ColsNoValidity/1073741824/512/256/0/iterations:8/manual_time 14.6 ms 14.6 ms 8 bytes_per_second=137.131G/s ContiguousSplit/1Gb512ColsValidity/1073741824/512/256/1/iterations:8/manual_time 17.2 ms 17.2 ms 8 bytes_per_second=119.946G/s ContiguousSplit/1Gb10ColsNoValidity/1073741824/10/256/0/iterations:8/manual_time 4.89 ms 4.89 ms 8 bytes_per_second=409.281G/s ContiguousSplit/1Gb10ColsValidity/1073741824/10/256/1/iterations:8/manual_time 5.09 ms 5.10 ms 8 bytes_per_second=404.981G/s ContiguousSplit/1Gb1ColNoSplits/1073741824/1/0/1/iterations:8/manual_time 4.40 ms 4.41 ms 8 bytes_per_second=469.011G/s ContiguousSplit/1Gb1ColValidityNoSplits/1073741824/1/0/1/iterations:8/manual_time 4.40 ms 4.41 ms 8 bytes_per_second=468.577G/s ContiguousSplitStrings/4Gb512ColsNoValidity/4294967296/512/256/0/iterations:8/manual_time 76.0 ms 75.9 ms 8 bytes_per_second=105.396G/s ContiguousSplitStrings/4Gb512ColsValidity/4294967296/512/256/1/iterations:8/manual_time 70.6 ms 70.5 ms 8 bytes_per_second=115.205G/s ContiguousSplitStrings/4Gb10ColsNoValidity/4294967296/10/256/0/iterations:8/manual_time 28.6 ms 29.6 ms 8 bytes_per_second=293.253G/s ContiguousSplitStrings/4Gb10ColsValidity/4294967296/10/256/1/iterations:8/manual_time 19.0 ms 19.0 ms 8 bytes_per_second=448.676G/s ContiguousSplitStrings/4Gb4ColsNoSplits/1073741824/4/0/0/iterations:8/manual_time 6.69 ms 7.32 ms 8 bytes_per_second=336.342G/s ContiguousSplitStrings/4Gb4ColsValidityNoSplits/1073741824/4/0/1/iterations:8/manual_time 4.40 ms 4.39 ms 8 bytes_per_second=518.755G/s ContiguousSplitStrings/1Gb512ColsNoValidity/1073741824/512/256/0/iterations:8/manual_time 55.4 ms 55.4 ms 8 bytes_per_second=36.1167G/s ContiguousSplitStrings/1Gb512ColsValidity/1073741824/512/256/1/iterations:8/manual_time 57.0 ms 56.9 ms 8 bytes_per_second=35.6588G/s 
ContiguousSplitStrings/1Gb10ColsNoValidity/1073741824/10/256/0/iterations:8/manual_time 8.48 ms 8.73 ms 8 bytes_per_second=247.664G/s ContiguousSplitStrings/1Gb10ColsValidity/1073741824/10/256/1/iterations:8/manual_time 5.99 ms 6.00 ms 8 bytes_per_second=355.742G/s ContiguousSplitStrings/1Gb1ColNoSplits/1073741824/1/0/0/iterations:8/manual_time 6.69 ms 9.30 ms 8 bytes_per_second=448.359G/s ContiguousSplitStrings/1Gb1ColValidityNoSplits/1073741824/1/0/1/iterations:8/manual_time 4.33 ms 4.33 ms 8 bytes_per_second=700.639G/s ``` Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Alessandro Bellina (https://github.com/abellina) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13342 --- cpp/include/cudf/detail/contiguous_split.hpp | 16 +++++- cpp/src/copying/contiguous_split.cu | 20 +++++--- cpp/src/copying/pack.cpp | 54 +++++++++++--------- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp index 4c6d19739cf..d9a35470b7d 100644 --- a/cpp/include/cudf/detail/contiguous_split.hpp +++ b/cpp/include/cudf/detail/contiguous_split.hpp @@ -67,7 +67,7 @@ class metadata_builder { * @brief Destructor that will be implemented as default, required because metadata_builder_impl * is incomplete at this stage. */ - ~metadata_builder() = default; + ~metadata_builder(); /** * @brief Add a column to this metadata builder. @@ -105,9 +105,23 @@ class metadata_builder { */ std::vector build() const; + /** + * @brief Clear the internal buffer containing all added metadata. + */ + void clear(); + private: std::unique_ptr impl; }; +/** + * @copydoc pack_metadata + * @param builder The reusable builder object to create packed column metadata. 
+ */ +std::vector pack_metadata(table_view const& table, + uint8_t const* contiguous_buffer, + size_t buffer_size, + metadata_builder& builder); + } // namespace detail } // namespace cudf diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 4c3b4eddb8d..e7ac424001c 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1251,6 +1251,8 @@ std::vector contiguous_split(cudf::table_view const& input, std::vector cols; cols.reserve(num_root_columns); auto cur_dst_buf_info = h_dst_buf_info; + cudf::detail::metadata_builder meta_builder(num_root_columns); + for (std::size_t idx = 0; idx < num_partitions; idx++) { // traverse the buffers and build the columns. cur_dst_buf_info = build_output_columns( @@ -1258,14 +1260,18 @@ std::vector contiguous_split(cudf::table_view const& input, // pack the columns cudf::table_view t{cols}; - result.push_back(packed_table{ - t, - packed_columns{ - std::make_unique>(cudf::pack_metadata( - t, reinterpret_cast(out_buffers[idx].data()), out_buffers[idx].size())), - std::make_unique(std::move(out_buffers[idx]))}}); - cols.clear(); + + cudf::packed_columns packed_cols{ + std::make_unique>( + cudf::detail::pack_metadata(t, + reinterpret_cast(out_buffers[idx].data()), + out_buffers[idx].size(), + meta_builder)), + std::make_unique(std::move(out_buffers[idx]))}; + meta_builder.clear(); + + result.emplace_back(packed_table{std::move(t), std::move(packed_cols)}); } return result; } diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index bac9aac1886..e4de4a43b68 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -35,6 +35,8 @@ namespace { * and unpack. */ struct serialized_column { + serialized_column() = default; + serialized_column(data_type _type, size_type _size, size_type _null_count, @@ -150,24 +152,22 @@ packed_columns pack(cudf::table_view const& input, return contig_split_result.empty() ? 
packed_columns{} : std::move(contig_split_result[0].data); } -template -std::vector pack_metadata(ColumnIter begin, - ColumnIter end, +std::vector pack_metadata(table_view const& table, uint8_t const* contiguous_buffer, - size_t buffer_size) + size_t buffer_size, + metadata_builder& builder) { - auto mb = metadata_builder(std::distance(begin, end)); - - std::for_each(begin, end, [&mb, &contiguous_buffer, &buffer_size](column_view const& col) { - build_column_metadata(mb, col, contiguous_buffer, buffer_size); - }); + std::for_each( + table.begin(), table.end(), [&builder, contiguous_buffer, buffer_size](column_view const& col) { + build_column_metadata(builder, col, contiguous_buffer, buffer_size); + }); - return mb.build(); + return builder.build(); } class metadata_builder_impl { public: - metadata_builder_impl() = default; + metadata_builder_impl(size_type const num_root_columns) { metadata.reserve(num_root_columns); } void add_column_info_to_meta(data_type const col_type, size_type const col_size, @@ -182,14 +182,16 @@ class metadata_builder_impl { std::vector build() const { - // convert to anonymous bytes - std::vector metadata_bytes; - auto const metadata_begin = reinterpret_cast(metadata.data()); - std::copy(metadata_begin, - metadata_begin + (metadata.size() * sizeof(detail::serialized_column)), - std::back_inserter(metadata_bytes)); - - return metadata_bytes; + auto output = std::vector(metadata.size() * sizeof(detail::serialized_column)); + std::memcpy(output.data(), metadata.data(), output.size()); + return output; + } + + void clear() + { + // Clear all, except the first metadata entry storing the number of top level columns that + // was added upon object construction. 
+ metadata.resize(1); } private: @@ -228,13 +230,16 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data) } metadata_builder::metadata_builder(size_type const num_root_columns) - : impl(std::make_unique()) + : impl(std::make_unique(num_root_columns + + 1 /*one more extra metadata entry as below*/)) { // first metadata entry is a stub indicating how many total (top level) columns // there are impl->add_column_info_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0); } +metadata_builder::~metadata_builder() = default; + void metadata_builder::add_column_info_to_meta(data_type const col_type, size_type const col_size, size_type const col_null_count, @@ -248,6 +253,8 @@ void metadata_builder::add_column_info_to_meta(data_type const col_type, std::vector metadata_builder::build() const { return impl->build(); } +void metadata_builder::clear() { return impl->clear(); } + } // namespace detail /** @@ -267,9 +274,10 @@ std::vector pack_metadata(table_view const& table, size_t buffer_size) { CUDF_FUNC_RANGE(); - return table.is_empty() - ? std::vector{} - : detail::pack_metadata(table.begin(), table.end(), contiguous_buffer, buffer_size); + if (table.is_empty()) { return std::vector{}; } + + auto builder = cudf::detail::metadata_builder(table.num_columns()); + return detail::pack_metadata(table, contiguous_buffer, buffer_size, builder); } /**