diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3c7e10c9bc4..60d0094efac 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -586,6 +586,7 @@ add_library( src/strings/filling/fill.cu src/strings/filter_chars.cu src/strings/like.cu + src/strings/merge/merge.cu src/strings/padding.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 2c78a31f0f8..d36ecfd3a21 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -236,7 +236,9 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp) # ################################################################################################## # * merge benchmark ------------------------------------------------------------------------------- ConfigureBench(MERGE_BENCH merge/merge.cpp) -ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp) +ConfigureNVBench( + MERGE_NVBENCH merge/merge_lists.cpp merge/merge_structs.cpp merge/merge_strings.cpp +) # ################################################################################################## # * null_mask benchmark --------------------------------------------------------------------------- diff --git a/cpp/benchmarks/merge/merge_strings.cpp b/cpp/benchmarks/merge/merge_strings.cpp new file mode 100644 index 00000000000..3d0f1865490 --- /dev/null +++ b/cpp/benchmarks/merge/merge_strings.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +void nvbench_merge_strings(nvbench::state& state) +{ + auto stream = cudf::get_default_stream(); + + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + if (static_cast(2 * num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .no_validity(); + auto const source_tables = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile); + + auto const sorted_lhs = cudf::sort(cudf::table_view({source_tables->view().column(0)})); + auto const sorted_rhs = cudf::sort(cudf::table_view({source_tables->view().column(1)})); + auto const lhs = sorted_lhs->view().column(0); + auto const rhs = sorted_rhs->view().column(0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = cudf::strings_column_view(lhs).chars_size(stream) + + cudf::strings_column_view(rhs).chars_size(stream); + state.add_global_memory_reads(chars_size); // all bytes are read + state.add_global_memory_writes(chars_size); // all bytes are written + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto result = cudf::merge( + {cudf::table_view({lhs}), cudf::table_view({rhs})}, {0}, {cudf::order::ASCENDING}); + }); +} + +NVBENCH_BENCH(nvbench_merge_strings) + .set_name("merge_strings") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh deleted file mode 100644 index 457c2b7f740..00000000000 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace cudf { -namespace strings { -namespace detail { -/** - * @brief Merges two strings columns. - * - * Caller must set the validity mask in the output column. - * - * @tparam row_order_iterator This must be an iterator for type thrust::tuple. - * - * @param lhs First column. - * @param rhs Second column. - * @param row_order Indexes for each column. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. - */ -template -std::unique_ptr merge(strings_column_view const& lhs, - strings_column_view const& rhs, - row_order_iterator begin, - row_order_iterator end, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using cudf::detail::side; - size_type strings_count = static_cast(std::distance(begin, end)); - if (strings_count == 0) return make_empty_column(type_id::STRING); - - auto lhs_column = column_device_view::create(lhs.parent(), stream); - auto d_lhs = *lhs_column; - auto rhs_column = column_device_view::create(rhs.parent(), stream); - auto d_rhs = *rhs_column; - - // caller will set the null mask - rmm::device_buffer null_mask{0, stream, mr}; - size_type null_count = lhs.null_count() + rhs.null_count(); - if (null_count > 0) - null_mask = cudf::detail::create_null_mask(strings_count, mask_state::ALL_VALID, stream, mr); - - // build offsets column - auto offsets_transformer = - cuda::proclaim_return_type([d_lhs, d_rhs] __device__(auto index_pair) { - auto const [side, index] = index_pair; - if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) return 0; - auto d_str = - side == side::LEFT ? d_lhs.element(index) : d_rhs.element(index); - return d_str.size_bytes(); - }); - auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); - auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().template data(); - - // create the chars column - rmm::device_uvector chars(bytes, stream, mr); - auto d_chars = chars.data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [d_lhs, d_rhs, begin, d_offsets, d_chars] __device__(size_type idx) { - auto const [side, index] = begin[idx]; - if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) return; - auto d_str = side == side::LEFT ? d_lhs.element(index) - : d_rhs.element(index); - memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); - }); - - return make_strings_column( - strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); -} - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/include/cudf/strings/detail/merge.hpp b/cpp/include/cudf/strings/detail/merge.hpp new file mode 100644 index 00000000000..35fd9c0593d --- /dev/null +++ b/cpp/include/cudf/strings/detail/merge.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace cudf ::strings ::detail { +/** + * @brief Merges two strings columns + * + * @param lhs First column + * @param rhs Second column + * @param row_order Indices for each column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +std::unique_ptr merge(strings_column_view const& lhs, + strings_column_view const& rhs, + cudf::detail::index_vector const& row_order, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::strings::detail diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 7136df325f4..35812c0573d 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -164,22 +164,22 @@ std::pair, int64_t> make_offsets_child_column( }); auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); // Use the sizes-to-offsets iterator to compute the total number of elements - auto const total_elements = + auto const total_bytes = cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); - // TODO: replace exception with if-statement when enabling creating INT64 offsets - CUDF_EXPECTS(total_elements <= size_type_max, - "Size of output exceeds the character size limit", + auto const threshold = get_offset64_threshold(); + CUDF_EXPECTS(is_large_strings_enabled() || (total_bytes < threshold), + "Size of output exceeds the column size limit", std::overflow_error); - // if (total_elements >= get_offset64_threshold()) { - // // recompute as int64 offsets when above the threshold - // offsets_column = make_numeric_column( - // data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - // auto d_offsets64 = offsets_column->mutable_view().template data(); - // sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); - // } - - return std::pair(std::move(offsets_column), total_elements); + if (total_bytes >= get_offset64_threshold()) { + // recompute as int64 offsets when above the threshold + offsets_column = make_numeric_column( + data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets64 = offsets_column->mutable_view().template data(); + cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); + } + + return std::pair(std::move(offsets_column), total_bytes); } } // namespace detail diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 079b6a73e0b..a3221038eed 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -86,9 +86,10 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, return (item.first != nullptr ? static_cast(item.second) : size_type{0}); }); auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); - auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column( + auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto offsets_view = offsets_column->view(); + auto const d_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // create null mask auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; @@ -98,11 +99,10 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_data = [offsets_view, bytes = bytes, begin, strings_count, null_count, stream, mr] { + auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] { auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); // use a character-parallel kernel for long string lengths if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { - auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_view); auto const str_begin = thrust::make_transform_iterator( begin, cuda::proclaim_return_type([] __device__(auto ip) { return string_view{ip.first, ip.second}; @@ -121,12 +121,11 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto d_chars = chars_data.data(); auto copy_chars = [d_chars] __device__(auto item) { string_index_pair const str = thrust::get<0>(item); - size_type const offset = thrust::get<1>(item); + int64_t const offset = thrust::get<1>(item); if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); }; thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_tuple(begin, offsets_view.template begin())), + thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)), strings_count, copy_chars); return chars_data; @@ -168,21 +167,15 @@ std::unique_ptr make_strings_column(CharIterator chars_begin, { CUDF_FUNC_RANGE(); size_type strings_count = thrust::distance(offsets_begin, offsets_end) - 1; - size_type bytes = std::distance(chars_begin, chars_end) * sizeof(char); - if (strings_count == 0) return make_empty_column(type_id::STRING); + if (strings_count == 0) { return make_empty_column(type_id::STRING); } + int64_t const bytes = std::distance(chars_begin, chars_end) * sizeof(char); CUDF_EXPECTS(bytes >= 0, "invalid offsets data"); // build offsets column -- this is the number of strings + 1 - auto offsets_column = make_numeric_column( - data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - thrust::transform(rmm::exec_policy(stream), - offsets_begin, - offsets_end, - offsets_view.data(), - cuda::proclaim_return_type( - [] __device__(auto offset) { return static_cast(offset); })); + auto [offsets_column, computed_bytes] = + cudf::strings::detail::make_offsets_child_column(offsets_begin, offsets_end, stream, mr); + CUDF_EXPECTS(bytes == computed_bytes, "unexpected byte count"); // build chars column rmm::device_uvector chars_data(bytes, stream, mr); diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 4463b16df78..5a3be259ed9 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include @@ -434,18 +434,8 @@ std::unique_ptr column_merger::operator()( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - auto column = strings::detail::merge(strings_column_view(lcol), - strings_column_view(rcol), - row_order_.begin(), - row_order_.end(), - stream, - mr); - if (lcol.has_nulls() || rcol.has_nulls()) { - auto merged_view = column->mutable_view(); - materialize_bitmask( - lcol, rcol, merged_view.null_mask(), merged_view.size(), row_order_.data(), stream); - } - return column; + return strings::detail::merge( + strings_column_view(lcol), strings_column_view(rcol), row_order_, stream, mr); } // specialization for dictionary diff --git a/cpp/src/strings/merge/merge.cu b/cpp/src/strings/merge/merge.cu new file mode 100644 index 00000000000..28e171f157e --- /dev/null +++ b/cpp/src/strings/merge/merge.cu @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { +std::unique_ptr merge(strings_column_view const& lhs, + strings_column_view const& rhs, + cudf::detail::index_vector const& row_order, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using cudf::detail::side; + if (row_order.is_empty()) { return make_empty_column(type_id::STRING); } + auto const strings_count = static_cast(row_order.size()); + + auto const lhs_column = column_device_view::create(lhs.parent(), stream); + auto const d_lhs = *lhs_column; + auto const rhs_column = column_device_view::create(rhs.parent(), stream); + auto const d_rhs = *rhs_column; + + auto const begin = row_order.begin(); + + // build vector of strings + rmm::device_uvector indices(strings_count, stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + indices.begin(), + [d_lhs, d_rhs, begin] __device__(size_type idx) { + auto const [s, index] = begin[idx]; + if (s == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) { + return string_index_pair{nullptr, 0}; + } + auto d_str = (s == side::LEFT) ? d_lhs.element(index) + : d_rhs.element(index); + return d_str.size_bytes() == 0 + ? string_index_pair{"", 0} // ensures empty != null + : string_index_pair{d_str.data(), d_str.size_bytes()}; + }); + + // convert vector into strings column + return make_strings_column(indices.begin(), indices.end(), stream, mr); +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index 28179a7341c..d7368d31944 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -411,3 +411,60 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view2, output_column_view2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view3, output_column_view3); } + +class MergeLargeStringsTest : public cudf::test::BaseFixture {}; + +TEST_F(MergeLargeStringsTest, MergeLargeStrings) +{ + CUDF_TEST_ENABLE_LARGE_STRINGS(); + auto itr = thrust::constant_iterator( + "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes + auto const input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB + auto input_views = std::vector(); + auto const view = cudf::table_view({input}); + std::vector splits; + int const multiplier = 10; + for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB + input_views.push_back(view); + splits.push_back(view.num_rows() * (i + 1)); + } + splits.pop_back(); // remove last entry + auto const column_order = std::vector{cudf::order::ASCENDING}; + auto const null_precedence = std::vector{cudf::null_order::AFTER}; + + auto result = cudf::merge(input_views, {0}, column_order, null_precedence); + auto sv = cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * multiplier); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + + auto sliced = cudf::split(sv.parent(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also test with large strings column as input + input_views.clear(); + input_views.push_back(view); // regular column + input_views.push_back(result->view()); // large column + result = cudf::merge(input_views, {0}, column_order, null_precedence); + sv = cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * (multiplier + 1)); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + splits.push_back(view.num_rows() * multiplier); + sliced = cudf::split(sv.parent(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also check merge still returns 32-bit offsets for regular columns + input_views.clear(); + input_views.push_back(view); + input_views.push_back(view); + result = cudf::merge(input_views, {0}, column_order, null_precedence); + sv = cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * 2); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32}); + sliced = cudf::split(sv.parent(), {view.num_rows()}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input); +}