From e446371bf16373590043474df5c023e610ff012c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 24 Sep 2024 16:28:20 -0400 Subject: [PATCH 01/23] Add cudf::strings::contains_multiple --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/string/find.cpp | 25 +- cpp/include/cudf/strings/find.hpp | 33 ++ cpp/src/strings/search/contains_multiple.cu | 330 ++++++++++++++++++++ cpp/tests/strings/find_tests.cpp | 153 ++++++++- 5 files changed, 538 insertions(+), 4 deletions(-) create mode 100644 cpp/src/strings/search/contains_multiple.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 84b462bb884..38ee5389cf2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -603,6 +603,7 @@ add_library( src/strings/replace/replace_slice.cu src/strings/reverse.cu src/strings/scan/scan_inclusive.cu + src/strings/search/contains_multiple.cu src/strings/search/findall.cu src/strings/search/find.cu src/strings/search/find_multiple.cu diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index a9c620e4bf0..baa34b13e2e 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -73,6 +73,28 @@ static void bench_find_string(nvbench::state& state) } else if (api == "contains") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + } else if (api == "contains_multi") { + constexpr int iters = 10; + std::vector match_targets({" abc", + "W43", + "0987 5W43", + "123 abc", + "23 abc", + "3 abc", + "é", + "7 5W43", + "87 5W43", + "987 5W43"}); + auto multi_targets = std::vector{}; + for (int i = 0; i < iters; i++) { + multi_targets.emplace_back(match_targets[i % match_targets.size()]); + } + cudf::test::strings_column_wrapper multi_targets_column(multi_targets.begin(), + multi_targets.end()); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::contains_multiple(input, cudf::strings_column_view(multi_targets_column)); + }); } else if (api == "starts_with") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); @@ -84,7 +106,8 @@ static void bench_find_string(nvbench::state& state) NVBENCH_BENCH(bench_find_string) .set_name("find_string") - .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"}) + .add_string_axis("api", + {"find", "find_multi", "contains", "contains_multi", "starts_with", "ends_with"}) .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index e024b116a71..497c263cca8 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -163,6 +163,39 @@ std::unique_ptr contains( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns a table of columns of boolean values for each string where true indicates + * the target string was found within that string in the provided column + * + * Each column in the result table corresponds to the result for the target string at the same + * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th, + * etc. + * + * If the target is not found for a string, false is returned for that entry in the output column. + * If the target is an empty string, true is returned for all non-null entries in the output column. + * + * Any null string entries return corresponding null entries in the output columns. + * + * @code{.pseudo} + * input = ["a", "b", "c"] + * targets = ["a", "c"] + * output is a table with two boolean columns: + * column 0: [true, false, false] + * column 1: [false, false, true] + * @endcode + * + * @param input Strings instance for this operation + * @param targets UTF-8 encoded strings to search for in each string in `input` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column + */ +std::unique_ptr contains_multiple( + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a column of boolean values for each string where true indicates * the target string was found at the beginning of that string in the provided column. diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu new file mode 100644 index 00000000000..e4673016e28 --- /dev/null +++ b/cpp/src/strings/search/contains_multiple.cu @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * a warp-parallel function is used. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; + +CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings, + column_device_view const d_targets, + u_char const* d_first_bytes, + size_type const* d_indices, + size_type const* d_offsets, + size_type unique_count, + cudf::device_span d_results) +{ + auto const num_targets = d_targets.size(); + auto const num_rows = d_strings.size(); + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / cudf::detail::warp_size; + if (str_idx >= num_rows) { return; } + if (d_strings.is_null(str_idx)) { return; } + // get the string for this warp + auto const d_str = d_strings.element(str_idx); + + // size of shared_bools = targets_size * block_size + // each thread uses targets_size bools + extern __shared__ bool shared_bools[]; + auto const lane_idx = idx % cudf::detail::warp_size; + + // initialize result: set true if target is empty, false otherwise + for (int target_idx = 0; target_idx < num_targets; target_idx++) { + auto const d_target = d_targets.element(target_idx); + shared_bools[threadIdx.x * num_targets + target_idx] = d_target.empty(); + } + + auto const last_ptr = d_first_bytes + unique_count; + for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes(); + str_byte_idx += cudf::detail::warp_size) { + // search for byte in first_bytes array + auto const chr = static_cast(*(d_str.data() + str_byte_idx)); + auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr); + // if not found, continue to next byte + if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } + // compute index of matched byte + auto offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; + auto const last_idx = + (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : unique_count; + // check for targets that begin with chr + while ((map_idx < num_targets) && (offset_idx < last_idx)) { + auto target_idx = d_indices[map_idx++]; + int temp_result_idx = threadIdx.x * num_targets + target_idx; + if (!shared_bools[temp_result_idx]) { // not found before + auto const d_target = d_targets.element(target_idx); + if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) { + // first char already checked, only need to check the [2nd, end) chars if has. + bool found = true; + for (auto i = 1; i < d_target.size_bytes() && found; i++) { + if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; } + } + if (found) { shared_bools[temp_result_idx] = true; } + } + } + ++offset_idx; + } + } + + // wait all lanes are done in a warp + __syncwarp(); + + if (lane_idx == 0) { + for (int target_idx = 0; target_idx < num_targets; target_idx++) { + bool found = false; + for (int lane_idx = 0; lane_idx < cudf::detail::warp_size; lane_idx++) { + int temp_idx = (threadIdx.x + lane_idx) * num_targets + target_idx; + if (shared_bools[temp_idx]) { + found = true; + break; + } + } + d_results[target_idx][str_idx] = found; + } + } +} + +CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings, + column_device_view const d_targets, + u_char const* d_first_bytes, + size_type const* d_indices, + size_type const* d_offsets, + size_type unique_count, + cudf::device_span d_results) +{ + auto const str_idx = static_cast(cudf::detail::grid_1d::global_thread_id()); + auto const num_targets = d_targets.size(); + auto const num_rows = d_strings.size(); + if (str_idx >= num_rows) { return; } + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + + // initialize output; the result of searching empty target is true + for (auto target_idx = 0; target_idx < num_targets; ++target_idx) { + auto const d_target = d_targets.element(target_idx); + d_results[target_idx][str_idx] = d_target.empty(); + } + + // process each byte of the current string + auto const last_ptr = d_first_bytes + unique_count; + for (auto str_byte_idx = 0; str_byte_idx < d_str.size_bytes(); ++str_byte_idx) { + // search for byte in first_bytes array + auto const chr = static_cast(*(d_str.data() + str_byte_idx)); + auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr); + // if not found, continue to next byte + if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } + // compute index of matched byte + auto offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; + auto const last_idx = + (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : unique_count; + // check for targets that begin with chr + while ((map_idx < num_targets) && (offset_idx < last_idx)) { + auto target_idx = d_indices[map_idx++]; + if (!d_results[target_idx][str_idx]) { // not found before + auto const d_target = d_targets.element(target_idx); + if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) { + // first char already checked, only need to check the [2nd, end) chars + bool found = true; + for (auto i = 1; i < d_target.size_bytes() && found; i++) { + if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; } + } + if (found) { d_results[target_idx][str_idx] = true; } + } + } + ++offset_idx; + } + } +} + +std::vector> multi_contains(bool warp_parallel, + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_targets = static_cast(targets.size()); + + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_targets = column_device_view::create(targets.parent(), stream); + + // copy the first byte of each target and sort the first bytes + auto first_bytes = rmm::device_uvector(targets.size(), stream); + auto indices = rmm::device_uvector(targets.size(), stream); + { + auto tgt_itr = thrust::make_transform_iterator( + d_targets->begin(), [] __device__(auto const& d_tgt) -> u_char { + return d_tgt.empty() ? u_char{0} : static_cast(d_tgt.data()[0]); + }); + auto count_itr = thrust::make_counting_iterator(0); + auto keys_out = first_bytes.begin(); + auto vals_out = indices.begin(); + auto cmp_op = thrust::less(); + auto sv = stream.value(); + + std::size_t tmp_bytes = 0; + cub::DeviceMergeSort::SortPairsCopy( + nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_targets, cmp_op, sv); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + cub::DeviceMergeSort::SortPairsCopy( + tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_targets, cmp_op, sv); + } + + // remove duplicates to speed up lower_bound + auto offsets = rmm::device_uvector(targets.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end()); + auto end = thrust::unique_by_key( + rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin()); + auto ucount = static_cast(thrust::distance(first_bytes.begin(), end.first)); + + // create output columns + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + }); + auto results_list = + std::vector>(results_iter, results_iter + targets.size()); + auto device_results_list = [&] { + auto host_results_pointer_iter = + thrust::make_transform_iterator(results_list.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = std::vector( + host_results_pointer_iter, host_results_pointer_iter + results_list.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + }(); + + constexpr cudf::thread_index_type block_size = 256; + + auto d_first_bytes = first_bytes.data(); + auto d_indices = indices.data(); + auto d_offsets = offsets.data(); + + if (warp_parallel) { + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; + int shared_mem_size = block_size * targets.size(); + multi_contains_warp_parallel<<>>( + *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list); + } else { + cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; + multi_contains_row_parallel<<>>( + *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list); + } + + return results_list; +} + +} // namespace + +std::unique_ptr
contains_multiple(strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string."); + CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null"); + + if ((input.null_count() == input.size()) || + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)) { + // Small strings. Searching for multiple targets in one thread seems to work fastest. + return std::make_unique
( + multi_contains(/**warp parallel**/ false, input, targets, stream, mr)); + } + + // Long strings + // Use warp parallel when the average string width is greater than the threshold + static constexpr size_type target_group_size = 16; // perhaps can be calculated + if (targets.size() <= target_group_size) { + return std::make_unique
( + multi_contains(/**warp parallel**/ true, input, targets, stream, mr)); + } + + // Too many targets will consume more shared memory, so split targets + // TODO: test with large working memory (instead of shared-memory) + std::vector> ret_columns; + auto const num_groups = cudf::util::div_rounding_up_safe(targets.size(), target_group_size); + for (size_type group_idx = 0; group_idx < num_groups; group_idx++) { + auto const start_target = group_idx * target_group_size; + auto const end_target = std::min(start_target + target_group_size, targets.size()); + + auto target_group = cudf::detail::slice(targets.parent(), start_target, end_target, stream); + auto bool_columns = multi_contains( + /**warp parallel**/ true, input, strings_column_view(target_group), stream, mr); + for (auto& c : bool_columns) { + ret_columns.push_back(std::move(c)); // transfer ownership + } + } + return std::make_unique
(std::move(ret_columns)); +} + +} // namespace detail + +std::unique_ptr
contains_multiple(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_multiple(strings, targets, stream, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 2da95ba5c27..698fea50bad 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -17,16 +17,15 @@ #include #include #include +#include +#include -#include #include #include #include #include #include -#include - #include struct StringsFindTest : public cudf::test::BaseFixture {}; @@ -198,6 +197,154 @@ TEST_F(StringsFindTest, ContainsLongStrings) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +TEST_F(StringsFindTest, MultiContains) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 9 rows: + std::vector s = { + "Héllo, there world and goodbye", + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", + "the following code snippet demonstrates how to use search for values in an ordered range", + "it returns the last position where value could be inserted without violating the ordering", + "algorithms execution is parallelized as determined by an execution policy. t", + "he this is a continuation of previous row to make sure string boundaries are honored", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~", + "", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + auto strings_view = cudf::strings_column_view(strings); + std::vector match_targets({" the ", "a", "", "é"}); + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = + cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column)); + + std::vector ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0}; + std::vector ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + + auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindTest, MultiContainsMoreTargets) +{ + auto const strings = cudf::test::strings_column_wrapper{ + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position"}; + auto strings_view = cudf::strings_column_view(strings); + std::vector targets({"lazy brown", "non-exist", ""}); + + std::vector> expects; + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({0, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 1, 1})); + + std::vector match_targets; + int max_num_targets = 50; + + for (int num_targets = 1; num_targets < max_num_targets; num_targets++) { + match_targets.clear(); + for (int i = 0; i < num_targets; i++) { + match_targets.push_back(targets[i % targets.size()]); + } + + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = cudf::strings::contains_multiple( + strings_view, cudf::strings_column_view(multi_targets_column)); + EXPECT_EQ(results->num_columns(), num_targets); + for (int i = 0; i < num_targets; i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]); + } + } +} + +TEST_F(StringsFindTest, MultiContainsLongStrings) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 7 rows: + std::vector s = { + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position", + "algorithms execution is parallelized as determined by an execution policy. t algorithms " + "execution is parallelized as ", + "he this is a continuation of previous row to make sure string boundaries are honored he this " + "is a continuation of previous row", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ " + "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + + auto sv = cudf::strings_column_view(strings); + auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "}); + auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets)); + + std::vector ret_0 = {1, 0, 1, 0, 0, 0, 0}; + std::vector ret_1 = {0, 1, 0, 0, 0, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {0, 0, 0, 0, 1, 0, 0}; + std::vector ret_4 = {1, 0, 0, 0, 0, 0, 0}; + std::vector ret_5 = {0, 0, 1, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + auto expected_4 = make_bool_col_fn(ret_4); + auto expected_5 = make_bool_col_fn(ret_5); + + auto expected = + cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} + TEST_F(StringsFindTest, StartsWith) { cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""}, From 779ca646fe7c43207d935ed96d59f133def85cad Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 24 Sep 2024 16:55:04 -0400 Subject: [PATCH 02/23] add proclaim_return_type --- cpp/src/strings/search/contains_multiple.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index e4673016e28..c8204ff5f49 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -202,9 +203,10 @@ std::vector> multi_contains(bool warp_parallel, auto indices = rmm::device_uvector(targets.size(), stream); { auto tgt_itr = thrust::make_transform_iterator( - d_targets->begin(), [] __device__(auto const& d_tgt) -> u_char { + d_targets->begin(), + cuda::proclaim_return_type([] __device__(auto const& d_tgt) -> u_char { return d_tgt.empty() ? u_char{0} : static_cast(d_tgt.data()[0]); - }); + })); auto count_itr = thrust::make_counting_iterator(0); auto keys_out = first_bytes.begin(); auto vals_out = indices.begin(); From f19cb1c683facfc92a8f8dcd4ba267661de27bca Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 24 Sep 2024 17:41:46 -0400 Subject: [PATCH 03/23] fix while-loop check --- cpp/src/strings/search/contains_multiple.cu | 32 +++++++++------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index c8204ff5f49..ae05f75389f 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -65,10 +65,9 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings cudf::device_span d_results) { auto const num_targets = d_targets.size(); - auto const num_rows = d_strings.size(); auto const idx = cudf::detail::grid_1d::global_thread_id(); auto const str_idx = idx / cudf::detail::warp_size; - if (str_idx >= num_rows) { return; } + if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } // get the string for this warp auto const d_str = d_strings.element(str_idx); @@ -93,14 +92,13 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings // if not found, continue to next byte if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } // compute index of matched byte - auto offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); - auto map_idx = d_offsets[offset_idx]; - auto const last_idx = - (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : unique_count; + auto offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; + auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets; // check for targets that begin with chr - while ((map_idx < num_targets) && (offset_idx < last_idx)) { - auto target_idx = d_indices[map_idx++]; - int temp_result_idx = threadIdx.x * num_targets + target_idx; + while (map_idx < last_idx) { + auto const target_idx = d_indices[map_idx++]; + auto const temp_result_idx = (threadIdx.x * num_targets) + target_idx; if (!shared_bools[temp_result_idx]) { // not found before auto const d_target = d_targets.element(target_idx); if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) { @@ -112,7 +110,6 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings if (found) { shared_bools[temp_result_idx] = true; } } } - ++offset_idx; } } @@ -144,8 +141,7 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings, { auto const str_idx = static_cast(cudf::detail::grid_1d::global_thread_id()); auto const num_targets = d_targets.size(); - auto const num_rows = d_strings.size(); - if (str_idx >= num_rows) { return; } + if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } auto const d_str = d_strings.element(str_idx); @@ -164,13 +160,12 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings, // if not found, continue to next byte if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } // compute index of matched byte - auto offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); - auto map_idx = d_offsets[offset_idx]; - auto const last_idx = - (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : unique_count; + auto offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; + auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets; // check for targets that begin with chr - while ((map_idx < num_targets) && (offset_idx < last_idx)) { - auto target_idx = d_indices[map_idx++]; + while (map_idx < last_idx) { + auto const target_idx = d_indices[map_idx++]; if (!d_results[target_idx][str_idx]) { // not found before auto const d_target = d_targets.element(target_idx); if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) { @@ -182,7 +177,6 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings, if (found) { d_results[target_idx][str_idx] = true; } } } - ++offset_idx; } } } From 65e21d3f05c48d87e85711e884fe12630fd008d7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 26 Sep 2024 11:54:05 -0400 Subject: [PATCH 04/23] cleanup code --- cpp/src/strings/search/contains_multiple.cu | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index ae05f75389f..9e6ba688c2f 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -101,7 +101,7 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings auto const temp_result_idx = (threadIdx.x * num_targets) + target_idx; if (!shared_bools[temp_result_idx]) { // not found before auto const d_target = d_targets.element(target_idx); - if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) { + if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { // first char already checked, only need to check the [2nd, end) chars if has. bool found = true; for (auto i = 1; i < d_target.size_bytes() && found; i++) { @@ -119,12 +119,10 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings if (lane_idx == 0) { for (int target_idx = 0; target_idx < num_targets; target_idx++) { bool found = false; - for (int lane_idx = 0; lane_idx < cudf::detail::warp_size; lane_idx++) { - int temp_idx = (threadIdx.x + lane_idx) * num_targets + target_idx; - if (shared_bools[temp_idx]) { - found = true; - break; - } + // use thrust::any() algorithm with strided iterator? + for (size_type lidx = 0; lidx < cudf::detail::warp_size && !found; lidx++) { + auto const temp_idx = ((threadIdx.x + lidx) * num_targets) + target_idx; + if (shared_bools[temp_idx]) { found = true; } } d_results[target_idx][str_idx] = found; } @@ -168,7 +166,7 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings, auto const target_idx = d_indices[map_idx++]; if (!d_results[target_idx][str_idx]) { // not found before auto const d_target = d_targets.element(target_idx); - if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) { + if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { // first char already checked, only need to check the [2nd, end) chars bool found = true; for (auto i = 1; i < d_target.size_bytes() && found; i++) { From b9da939a5dff69adcf1e35ee992e0d15134e538d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 26 Sep 2024 17:21:01 -0400 Subject: [PATCH 05/23] change shared memory layout --- cpp/src/strings/search/contains_multiple.cu | 48 +++++++++++---------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index 9e6ba688c2f..f154fb0a89c 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -72,15 +73,21 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings // get the string for this warp auto const d_str = d_strings.element(str_idx); - // size of shared_bools = targets_size * block_size - // each thread uses targets_size bools + // size of shared_bools = num_targets * block_size + // each thread uses num_targets bools extern __shared__ bool shared_bools[]; auto const lane_idx = idx % cudf::detail::warp_size; + auto const warp_idx = threadIdx.x / cudf::detail::warp_size; + // bools for the current string + auto bools = shared_bools + (warp_idx * cudf::detail::warp_size * num_targets); // initialize result: set true if target is empty, false otherwise - for (int target_idx = 0; target_idx < num_targets; target_idx++) { + for (auto target_idx = lane_idx; target_idx < num_targets; + target_idx += cudf::detail::warp_size) { auto const d_target = d_targets.element(target_idx); - shared_bools[threadIdx.x * num_targets + target_idx] = d_target.empty(); + auto const begin = bools + (target_idx * cudf::detail::warp_size); + thrust::uninitialized_fill( + thrust::seq, begin, begin + cudf::detail::warp_size, d_target.empty()); } auto const last_ptr = d_first_bytes + unique_count; @@ -92,14 +99,14 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings // if not found, continue to next byte if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } // compute index of matched byte - auto offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); - auto map_idx = d_offsets[offset_idx]; + auto const offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets; // check for targets that begin with chr while (map_idx < last_idx) { - auto const target_idx = d_indices[map_idx++]; - auto const temp_result_idx = (threadIdx.x * num_targets) + target_idx; - if (!shared_bools[temp_result_idx]) { // not found before + auto const target_idx = d_indices[map_idx++]; + auto const bool_idx = (target_idx * cudf::detail::warp_size) + lane_idx; + if (!bools[bool_idx]) { // not found before auto const d_target = d_targets.element(target_idx); if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { // first char already checked, only need to check the [2nd, end) chars if has. @@ -107,7 +114,7 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings for (auto i = 1; i < d_target.size_bytes() && found; i++) { if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; } } - if (found) { shared_bools[temp_result_idx] = true; } + if (found) { bools[bool_idx] = true; } } } } @@ -116,16 +123,13 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings // wait all lanes are done in a warp __syncwarp(); - if (lane_idx == 0) { - for (int target_idx = 0; target_idx < num_targets; target_idx++) { - bool found = false; - // use thrust::any() algorithm with strided iterator? - for (size_type lidx = 0; lidx < cudf::detail::warp_size && !found; lidx++) { - auto const temp_idx = ((threadIdx.x + lidx) * num_targets) + target_idx; - if (shared_bools[temp_idx]) { found = true; } - } - d_results[target_idx][str_idx] = found; - } + // reduce the bools for each target to store in the result + for (auto target_idx = lane_idx; target_idx < num_targets; + target_idx += cudf::detail::warp_size) { + auto begin = bools + (target_idx * cudf::detail::warp_size); + auto found = + thrust::any_of(thrust::seq, begin, begin + cudf::detail::warp_size, thrust::identity{}); + d_results[target_idx][str_idx] = found; } } @@ -158,8 +162,8 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings, // if not found, continue to next byte if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } // compute index of matched byte - auto offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); - auto map_idx = d_offsets[offset_idx]; + auto const offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets; // check for targets that begin with chr while (map_idx < last_idx) { From e8cb6cf8eb0441564e6f1c191535a3709c7fb74b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 26 Sep 2024 17:52:03 -0400 Subject: [PATCH 06/23] use global memory if shared memory limit is reached --- cpp/src/strings/search/contains_multiple.cu | 104 ++++++++------------ 1 file changed, 42 insertions(+), 62 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index f154fb0a89c..a88c9848c24 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -63,6 +63,7 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings size_type const* d_indices, size_type const* d_offsets, size_type unique_count, + bool* working_memory, cudf::device_span d_results) { auto const num_targets = d_targets.size(); @@ -73,13 +74,16 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings // get the string for this warp auto const d_str = d_strings.element(str_idx); + auto const lane_idx = idx % cudf::detail::warp_size; + // size of shared_bools = num_targets * block_size // each thread uses num_targets bools extern __shared__ bool shared_bools[]; - auto const lane_idx = idx % cudf::detail::warp_size; auto const warp_idx = threadIdx.x / cudf::detail::warp_size; // bools for the current string - auto bools = shared_bools + (warp_idx * cudf::detail::warp_size * num_targets); + auto bools = working_memory == nullptr + ? (shared_bools + (warp_idx * cudf::detail::warp_size * num_targets)) + : (working_memory + (str_idx * cudf::detail::warp_size * num_targets)); // initialize result: set true if target is empty, false otherwise for (auto target_idx = lane_idx; target_idx < num_targets; @@ -183,18 +187,22 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings, } } -std::vector> multi_contains(bool warp_parallel, - strings_column_view const& input, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +} // namespace + +std::unique_ptr
contains_multiple(strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto const num_targets = static_cast(targets.size()); + CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string."); + CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null"); + + auto const num_targets = targets.size(); auto const d_strings = column_device_view::create(input.parent(), stream); auto const d_targets = column_device_view::create(targets.parent(), stream); - // copy the first byte of each target and sort the first bytes + // copy the first byte of each target and sort them auto first_bytes = rmm::device_uvector(targets.size(), stream); auto indices = rmm::device_uvector(targets.size(), stream); { @@ -251,66 +259,38 @@ std::vector> multi_contains(bool warp_parallel, auto d_indices = indices.data(); auto d_offsets = offsets.data(); - if (warp_parallel) { + // Smaller strings perform better with a row per string + bool const row_parallel = ((input.null_count() == input.size()) || + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)); + if (row_parallel) { + cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; + multi_contains_row_parallel<<>>( + *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list); + } else { cudf::detail::grid_1d grid{ static_cast(input.size()) * cudf::detail::warp_size, block_size}; - int shared_mem_size = block_size * targets.size(); + auto shared_mem_size = block_size * targets.size(); + size_type work_mem_size = 0; + if (shared_mem_size > 12000) { // TODO: Need to find a good value for this + shared_mem_size = 0; + work_mem_size = targets.size() * input.size() * cudf::detail::warp_size; + } + auto working_memory = rmm::device_uvector(work_mem_size, stream); multi_contains_warp_parallel<<>>( - *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list); - } else { - cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; - multi_contains_row_parallel<<>>( - *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list); - } - - return results_list; -} - -} // namespace - -std::unique_ptr
contains_multiple(strings_column_view const& input, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string."); - CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null"); - - if ((input.null_count() == input.size()) || - ((input.chars_size(stream) / (input.size() - input.null_count())) <= - AVG_CHAR_BYTES_THRESHOLD)) { - // Small strings. Searching for multiple targets in one thread seems to work fastest. - return std::make_unique
( - multi_contains(/**warp parallel**/ false, input, targets, stream, mr)); + stream.value()>>>(*d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + ucount, + working_memory.data(), + device_results_list); } - // Long strings - // Use warp parallel when the average string width is greater than the threshold - static constexpr size_type target_group_size = 16; // perhaps can be calculated - if (targets.size() <= target_group_size) { - return std::make_unique
( - multi_contains(/**warp parallel**/ true, input, targets, stream, mr)); - } - - // Too many targets will consume more shared memory, so split targets - // TODO: test with large working memory (instead of shared-memory) - std::vector> ret_columns; - auto const num_groups = cudf::util::div_rounding_up_safe(targets.size(), target_group_size); - for (size_type group_idx = 0; group_idx < num_groups; group_idx++) { - auto const start_target = group_idx * target_group_size; - auto const end_target = std::min(start_target + target_group_size, targets.size()); - - auto target_group = cudf::detail::slice(targets.parent(), start_target, end_target, stream); - auto bool_columns = multi_contains( - /**warp parallel**/ true, input, strings_column_view(target_group), stream, mr); - for (auto& c : bool_columns) { - ret_columns.push_back(std::move(c)); // transfer ownership - } - } - return std::make_unique
(std::move(ret_columns)); + return std::make_unique
(std::move(results_list)); } } // namespace detail From 5cc9d54e0b71336740e72f5fa3d488cd42e47f91 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 26 Sep 2024 18:05:05 -0400 Subject: [PATCH 07/23] change shared-memory threshold --- cpp/src/strings/search/contains_multiple.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index a88c9848c24..52a046e7cf0 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -272,7 +272,7 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, static_cast(input.size()) * cudf::detail::warp_size, block_size}; auto shared_mem_size = block_size * targets.size(); size_type work_mem_size = 0; - if (shared_mem_size > 12000) { // TODO: Need to find a good value for this + if (shared_mem_size > (16 * block_size)) { // TODO: Need to find a good value for this shared_mem_size = 0; work_mem_size = targets.size() * input.size() * cudf::detail::warp_size; } From 4acb531c655d1c35285493f4edb465dcc2a41a32 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 27 Sep 2024 08:19:50 -0400 Subject: [PATCH 08/23] factor out benchmarks for find/contains_multiple --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/string/find.cpp | 38 +-------- cpp/benchmarks/string/find_multiple.cpp | 87 +++++++++++++++++++++ cpp/include/cudf/strings/find.hpp | 33 -------- cpp/include/cudf/strings/find_multiple.hpp | 33 ++++++++ cpp/src/strings/search/contains_multiple.cu | 2 +- cpp/tests/strings/find_tests.cpp | 1 + 7 files changed, 126 insertions(+), 69 deletions(-) create mode 100644 cpp/benchmarks/string/find_multiple.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..c8da31544fc 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -373,6 +373,7 @@ ConfigureNVBench( string/count.cpp string/extract.cpp string/find.cpp + string/find_multiple.cpp string/gather.cpp string/join_strings.cpp string/lengths.cpp diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index baa34b13e2e..2d5b6963a75 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -19,11 +19,8 @@ #include -#include #include -#include #include -#include #include #include @@ -49,15 +46,13 @@ static void bench_find_string(nvbench::state& state) auto const col = build_input_column(n_rows, row_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - std::vector h_targets({"5W", "5W43", "0987 5W43"}); - cudf::string_scalar target(h_targets[2]); - cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + cudf::string_scalar target("0987 5W43"); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); - if (api.substr(0, 4) == "find") { + if (api == "find") { state.add_global_memory_writes(input.size()); } else { state.add_global_memory_writes(input.size()); @@ -66,35 +61,9 @@ static void bench_find_string(nvbench::state& state) if (api == "find") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); - } else if (api == "find_multi") { - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); - }); } else if (api == "contains") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); - } else if (api == "contains_multi") { - constexpr int iters = 10; - std::vector match_targets({" abc", - "W43", - "0987 5W43", - "123 abc", - "23 abc", - "3 abc", - "é", - "7 5W43", - "87 5W43", - "987 5W43"}); - auto multi_targets = std::vector{}; - for (int i = 0; i < iters; i++) { - multi_targets.emplace_back(match_targets[i % match_targets.size()]); - } - cudf::test::strings_column_wrapper multi_targets_column(multi_targets.begin(), - multi_targets.end()); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - cudf::strings::contains_multiple(input, cudf::strings_column_view(multi_targets_column)); - }); } else if (api == "starts_with") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); @@ -106,8 +75,7 @@ static void bench_find_string(nvbench::state& state) NVBENCH_BENCH(bench_find_string) .set_name("find_string") - .add_string_axis("api", - {"find", "find_multi", "contains", "contains_multi", "starts_with", "ends_with"}) + .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"}) .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp new file mode 100644 index 00000000000..8d47a4b03db --- /dev/null +++ b/cpp/benchmarks/string/find_multiple.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include +#include +#include +#include + +#include + +std::unique_ptr build_input_column(cudf::size_type n_rows, + cudf::size_type row_width, + int32_t hit_rate); + +static void bench_find_string(nvbench::state& state) +{ + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + auto const target_count = static_cast(state.get_int64("targets")); + auto const api = state.get_string("api"); + + if (static_cast(n_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + auto const stream = cudf::get_default_stream(); + auto const col = build_input_column(n_rows, row_width, hit_rate); + auto const input = cudf::strings_column_view(col->view()); + + // Note that these all match the first row of the raw_data in build_input_column. + // This is so the hit_rate can properly accounted for. + std::vector target_data( + {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"}); + auto h_targets = std::vector{}; + for (cudf::size_type i = 0; i < target_count; i++) { + h_targets.emplace_back(target_data[i % target_data.size()]); + } + cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const chars_size = input.chars_size(stream); + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + if (api == "find") { + state.add_global_memory_writes(input.size()); + } else { + state.add_global_memory_writes(input.size()); + } + + if (api == "find") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); + }); + } else if (api == "contains") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::contains_multiple(input, cudf::strings_column_view(targets)); + }); + } +} + +NVBENCH_BENCH(bench_find_string) + .set_name("find_multiple") + .add_string_axis("api", {"find", "contains"}) + .add_int64_axis("targets", {10, 20}) + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) + .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index 497c263cca8..e024b116a71 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -163,39 +163,6 @@ std::unique_ptr contains( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); -/** - * @brief Returns a table of columns of boolean values for each string where true indicates - * the target string was found within that string in the provided column - * - * Each column in the result table corresponds to the result for the target string at the same - * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th, - * etc. - * - * If the target is not found for a string, false is returned for that entry in the output column. - * If the target is an empty string, true is returned for all non-null entries in the output column. - * - * Any null string entries return corresponding null entries in the output columns. - * - * @code{.pseudo} - * input = ["a", "b", "c"] - * targets = ["a", "c"] - * output is a table with two boolean columns: - * column 0: [true, false, false] - * column 1: [false, false, true] - * @endcode - * - * @param input Strings instance for this operation - * @param targets UTF-8 encoded strings to search for in each string in `input` - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return New BOOL8 column - */ -std::unique_ptr
contains_multiple( - strings_column_view const& input, - strings_column_view const& targets, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns a column of boolean values for each string where true indicates * the target string was found at the beginning of that string in the provided column. diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 1fe446db8da..90ffd6e7df9 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -27,6 +27,39 @@ namespace strings { * @file */ +/** + * @brief Returns a table of columns of boolean values for each string where true indicates + * the target string was found within that string in the provided column + * + * Each column in the result table corresponds to the result for the target string at the same + * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th, + * etc. + * + * If the target is not found for a string, false is returned for that entry in the output column. + * If the target is an empty string, true is returned for all non-null entries in the output column. + * + * Any null string entries return corresponding null entries in the output columns. + * + * @code{.pseudo} + * input = ["a", "b", "c"] + * targets = ["a", "c"] + * output is a table with two boolean columns: + * column 0: [true, false, false] + * column 1: [false, false, true] + * @endcode + * + * @param input Strings instance for this operation + * @param targets UTF-8 encoded strings to search for in each string in `input` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column + */ +std::unique_ptr
contains_multiple( + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a lists column with character position values where each * of the target strings are found in each string. diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index 52a046e7cf0..1921fc8e0ea 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 698fea50bad..19714a5afd0 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include From f3a3f249e9955473d83e667bb0ba902d78ac6608 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 27 Sep 2024 11:11:51 -0400 Subject: [PATCH 09/23] refactor kernels into templated one --- cpp/src/strings/search/contains_multiple.cu | 246 ++++++++++---------- 1 file changed, 119 insertions(+), 127 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index 1921fc8e0ea..e72cde6ec04 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -15,7 +15,6 @@ */ #include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -42,7 +42,7 @@ #include #include -#include +#include namespace cudf { namespace strings { @@ -57,48 +57,87 @@ namespace { */ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; -CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings, - column_device_view const d_targets, - u_char const* d_first_bytes, - size_type const* d_indices, - size_type const* d_offsets, - size_type unique_count, - bool* working_memory, - cudf::device_span d_results) +/** + * @brief Kernel for finding multiple targets in each row of input strings + * + * The d_first_bytes is sorted and unique so the d_indices and d_offsets + * are used to map the corresponding character to its d_targets entry. + * + * Example + * d_targets = ["foo", "hello", "world", "hi"] + * - sorted first-chars: ['f','h','h','w'] + * d_indices = [0, 3, 1, 2] + * d_first_bytes = ['f', 'h', 'w'] (unique) + * d_offsets = [0, 1, 3] + * unique_count = 3 + * + * If 'h' is found, lower_bound produces pos=1 in d_first_bytes. + * This corresponds to d_offset[1]==1 which has two values: + * - (d_offsets[2] - d_offsets[1]) = (3 - 1) = 2. + * Set map_idx = d_offsets[1] = 1 and the two targets to check are sequential + * in the d_indices array: + * - tgt1_idx = d_indices[map_idx] = 3 --> d_targets[3] == 'hi' + * - tgt2_idx = d_indices[map_idx+1] = 1 --> d_targets[1] == 'hello' + * The logic now only needs to check for either of these 2 targets. + * + * This kernel works in either row-per-string or warp-per-string depending + * on the template parameter. If tile_size==1, then it executes as a + * row-per-string. If tile_size=32, the it executes as a warp-per-string. + * No other options are supported for now. + * + * @tparam tile_size Number of threads per string + * @param d_strings Input strings + * @param d_targets Target strings to search within input strings + * @param d_first_bytes Sorted, unique list of first bytes of the target strings + * @param d_indices Indices to map sorted d_first_bytes to d_targets + * @param d_offsets Offsets to map d_indices to d_targets + * @param unique_count Number of unique values in d_first_bytes (and d_offsets) + * @param working_memory Global memory to use if shared-memory is too small + * @param d_results Bool results for each target within each string row + */ +template +CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, + column_device_view const d_targets, + u_char const* d_first_bytes, + size_type const* d_indices, + size_type const* d_offsets, + size_type unique_count, + bool* working_memory, + cudf::device_span d_results) { - auto const num_targets = d_targets.size(); - auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / cudf::detail::warp_size; + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / tile_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { return; } - // get the string for this warp + + // get the string for this tile auto const d_str = d_strings.element(str_idx); - auto const lane_idx = idx % cudf::detail::warp_size; + auto const lane_idx = idx % tile_size; + auto const tile_idx = static_cast(threadIdx.x) / tile_size; + auto const num_targets = d_targets.size(); // size of shared_bools = num_targets * block_size // each thread uses num_targets bools extern __shared__ bool shared_bools[]; - auto const warp_idx = threadIdx.x / cudf::detail::warp_size; // bools for the current string - auto bools = working_memory == nullptr - ? (shared_bools + (warp_idx * cudf::detail::warp_size * num_targets)) - : (working_memory + (str_idx * cudf::detail::warp_size * num_targets)); + auto bools = working_memory == nullptr ? (shared_bools + (tile_idx * tile_size * num_targets)) + : (working_memory + (str_idx * tile_size * num_targets)); // initialize result: set true if target is empty, false otherwise - for (auto target_idx = lane_idx; target_idx < num_targets; - target_idx += cudf::detail::warp_size) { + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { auto const d_target = d_targets.element(target_idx); - auto const begin = bools + (target_idx * cudf::detail::warp_size); - thrust::uninitialized_fill( - thrust::seq, begin, begin + cudf::detail::warp_size, d_target.empty()); + auto const begin = bools + (target_idx * tile_size); + thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty()); } + if constexpr (tile_size == cudf::detail::warp_size) { __syncwarp(); } auto const last_ptr = d_first_bytes + unique_count; for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes(); - str_byte_idx += cudf::detail::warp_size) { + str_byte_idx += tile_size) { // search for byte in first_bytes array - auto const chr = static_cast(*(d_str.data() + str_byte_idx)); + auto const sptr = d_str.data() + str_byte_idx; + auto const chr = static_cast(*sptr); auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr); // if not found, continue to next byte if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } @@ -109,84 +148,28 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings // check for targets that begin with chr while (map_idx < last_idx) { auto const target_idx = d_indices[map_idx++]; - auto const bool_idx = (target_idx * cudf::detail::warp_size) + lane_idx; + auto const bool_idx = (target_idx * tile_size) + lane_idx; if (!bools[bool_idx]) { // not found before auto const d_target = d_targets.element(target_idx); if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { - // first char already checked, only need to check the [2nd, end) chars if has. - bool found = true; - for (auto i = 1; i < d_target.size_bytes() && found; i++) { - if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; } + // first char already checked, so just check the [1, end) chars match + auto const tp = d_target.data(); + if (thrust::equal(thrust::seq, tp + 1, tp + d_target.size_bytes(), sptr + 1)) { + bools[bool_idx] = true; } - if (found) { bools[bool_idx] = true; } } } } } - - // wait all lanes are done in a warp - __syncwarp(); + if constexpr (tile_size == cudf::detail::warp_size) { __syncwarp(); } // reduce the bools for each target to store in the result - for (auto target_idx = lane_idx; target_idx < num_targets; - target_idx += cudf::detail::warp_size) { - auto begin = bools + (target_idx * cudf::detail::warp_size); - auto found = - thrust::any_of(thrust::seq, begin, begin + cudf::detail::warp_size, thrust::identity{}); - d_results[target_idx][str_idx] = found; - } -} - -CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings, - column_device_view const d_targets, - u_char const* d_first_bytes, - size_type const* d_indices, - size_type const* d_offsets, - size_type unique_count, - cudf::device_span d_results) -{ - auto const str_idx = static_cast(cudf::detail::grid_1d::global_thread_id()); - auto const num_targets = d_targets.size(); - if (str_idx >= d_strings.size()) { return; } - if (d_strings.is_null(str_idx)) { return; } - auto const d_str = d_strings.element(str_idx); - - // initialize output; the result of searching empty target is true - for (auto target_idx = 0; target_idx < num_targets; ++target_idx) { - auto const d_target = d_targets.element(target_idx); - d_results[target_idx][str_idx] = d_target.empty(); - } - - // process each byte of the current string - auto const last_ptr = d_first_bytes + unique_count; - for (auto str_byte_idx = 0; str_byte_idx < d_str.size_bytes(); ++str_byte_idx) { - // search for byte in first_bytes array - auto const chr = static_cast(*(d_str.data() + str_byte_idx)); - auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr); - // if not found, continue to next byte - if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } - // compute index of matched byte - auto const offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); - auto map_idx = d_offsets[offset_idx]; - auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets; - // check for targets that begin with chr - while (map_idx < last_idx) { - auto const target_idx = d_indices[map_idx++]; - if (!d_results[target_idx][str_idx]) { // not found before - auto const d_target = d_targets.element(target_idx); - if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { - // first char already checked, only need to check the [2nd, end) chars - bool found = true; - for (auto i = 1; i < d_target.size_bytes() && found; i++) { - if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; } - } - if (found) { d_results[target_idx][str_idx] = true; } - } - } - } + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { + auto const begin = bools + (target_idx * tile_size); + d_results[target_idx][str_idx] = + thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity{}); } } - } // namespace std::unique_ptr
contains_multiple(strings_column_view const& input, @@ -197,8 +180,6 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string."); CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null"); - auto const num_targets = targets.size(); - auto const d_strings = column_device_view::create(input.parent(), stream); auto const d_targets = column_device_view::create(targets.parent(), stream); @@ -214,23 +195,25 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, auto count_itr = thrust::make_counting_iterator(0); auto keys_out = first_bytes.begin(); auto vals_out = indices.begin(); + auto num_items = targets.size(); auto cmp_op = thrust::less(); auto sv = stream.value(); std::size_t tmp_bytes = 0; cub::DeviceMergeSort::SortPairsCopy( - nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_targets, cmp_op, sv); + nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); cub::DeviceMergeSort::SortPairsCopy( - tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_targets, cmp_op, sv); + tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); } - // remove duplicates to speed up lower_bound + // remove duplicates to help speed up lower_bound auto offsets = rmm::device_uvector(targets.size(), stream); thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end()); - auto end = thrust::unique_by_key( + auto const end = thrust::unique_by_key( rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin()); - auto ucount = static_cast(thrust::distance(first_bytes.begin(), end.first)); + auto const unique_count = + static_cast(thrust::distance(first_bytes.begin(), end.first)); // create output columns auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { @@ -241,56 +224,65 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, stream, mr); }); - auto results_list = - std::vector>(results_iter, results_iter + targets.size()); - auto device_results_list = [&] { + auto results = std::vector>(results_iter, results_iter + targets.size()); + auto d_results = [&] { auto host_results_pointer_iter = - thrust::make_transform_iterator(results_list.begin(), [](auto const& results_column) { + thrust::make_transform_iterator(results.begin(), [](auto const& results_column) { return results_column->mutable_view().template data(); }); - auto host_results_pointers = std::vector( - host_results_pointer_iter, host_results_pointer_iter + results_list.size()); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + results.size()); return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); }(); constexpr cudf::thread_index_type block_size = 256; + constexpr size_type targets_threshold = 16; // for shared-memory size auto d_first_bytes = first_bytes.data(); auto d_indices = indices.data(); auto d_offsets = offsets.data(); - // Smaller strings perform better with a row per string + auto const shared_mem_size = + targets.size() < targets_threshold ? (block_size * targets.size()) : 0; + auto const work_mem_size = + targets.size() < targets_threshold + ? 0 + : (static_cast(targets.size()) * input.size() * cudf::detail::warp_size); + auto working_memory = rmm::device_uvector(work_mem_size, stream); + bool const row_parallel = ((input.null_count() == input.size()) || ((input.chars_size(stream) / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)); if (row_parallel) { + // Smaller strings perform better with a row per string cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; - multi_contains_row_parallel<<>>( - *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list); + multi_contains_kernel<1> + <<>>( + *d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + working_memory.data(), + d_results); } else { + // Longer strings perform better with a warp per string cudf::detail::grid_1d grid{ static_cast(input.size()) * cudf::detail::warp_size, block_size}; - auto shared_mem_size = block_size * targets.size(); - size_type work_mem_size = 0; - if (shared_mem_size > (16 * block_size)) { // TODO: Need to find a good value for this - shared_mem_size = 0; - work_mem_size = targets.size() * input.size() * cudf::detail::warp_size; - } - auto working_memory = rmm::device_uvector(work_mem_size, stream); - multi_contains_warp_parallel<<>>(*d_strings, - *d_targets, - d_first_bytes, - d_indices, - d_offsets, - ucount, - working_memory.data(), - device_results_list); + multi_contains_kernel + <<>>( + *d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + working_memory.data(), + d_results); } - return std::make_unique
(std::move(results_list)); + return std::make_unique
(std::move(results)); } } // namespace detail From 33d784746246a7d0262bed37e53af829bcd0c3a7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 30 Sep 2024 14:06:54 -0400 Subject: [PATCH 10/23] move tests to find_multiple_tests.cpp --- cpp/benchmarks/string/find_multiple.cpp | 3 +- cpp/src/strings/search/contains_multiple.cu | 2 +- cpp/tests/strings/find_multiple_tests.cpp | 149 ++++++++++++++++++++ cpp/tests/strings/find_tests.cpp | 149 -------------------- 4 files changed, 151 insertions(+), 152 deletions(-) diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp index 8d47a4b03db..f3115677fa4 100644 --- a/cpp/benchmarks/string/find_multiple.cpp +++ b/cpp/benchmarks/string/find_multiple.cpp @@ -59,7 +59,6 @@ static void bench_find_string(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); - state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); if (api == "find") { state.add_global_memory_writes(input.size()); @@ -81,7 +80,7 @@ static void bench_find_string(nvbench::state& state) NVBENCH_BENCH(bench_find_string) .set_name("find_multiple") .add_string_axis("api", {"find", "contains"}) - .add_int64_axis("targets", {10, 20}) + .add_int64_axis("targets", {10, 20, 40}) .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index e72cde6ec04..7c63fbf0453 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -236,7 +236,7 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, }(); constexpr cudf::thread_index_type block_size = 256; - constexpr size_type targets_threshold = 16; // for shared-memory size + constexpr size_type targets_threshold = 32; // for shared-memory size auto d_first_bytes = first_bytes.data(); auto d_indices = indices.data(); diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index 41a5940c880..af6c5800f4d 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -80,3 +81,151 @@ TEST_F(StringsFindMultipleTest, ErrorTest) // targets cannot have nulls EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), cudf::logic_error); } + +TEST_F(StringsFindMultipleTest, MultiContains) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 9 rows: + std::vector s = { + "Héllo, there world and goodbye", + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", + "the following code snippet demonstrates how to use search for values in an ordered range", + "it returns the last position where value could be inserted without violating the ordering", + "algorithms execution is parallelized as determined by an execution policy. t", + "he this is a continuation of previous row to make sure string boundaries are honored", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~", + "", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + auto strings_view = cudf::strings_column_view(strings); + std::vector match_targets({" the ", "a", "", "é"}); + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = + cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column)); + + std::vector ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0}; + std::vector ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + + auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindMultipleTest, MultiContainsMoreTargets) +{ + auto const strings = cudf::test::strings_column_wrapper{ + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position"}; + auto strings_view = cudf::strings_column_view(strings); + std::vector targets({"lazy brown", "non-exist", ""}); + + std::vector> expects; + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({0, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 1, 1})); + + std::vector match_targets; + int max_num_targets = 50; + + for (int num_targets = 1; num_targets < max_num_targets; num_targets++) { + match_targets.clear(); + for (int i = 0; i < num_targets; i++) { + match_targets.push_back(targets[i % targets.size()]); + } + + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = cudf::strings::contains_multiple( + strings_view, cudf::strings_column_view(multi_targets_column)); + EXPECT_EQ(results->num_columns(), num_targets); + for (int i = 0; i < num_targets; i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]); + } + } +} + +TEST_F(StringsFindMultipleTest, MultiContainsLongStrings) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 7 rows: + std::vector s = { + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position", + "algorithms execution is parallelized as determined by an execution policy. t algorithms " + "execution is parallelized as ", + "he this is a continuation of previous row to make sure string boundaries are honored he this " + "is a continuation of previous row", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ " + "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + + auto sv = cudf::strings_column_view(strings); + auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "}); + auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets)); + + std::vector ret_0 = {1, 0, 1, 0, 0, 0, 0}; + std::vector ret_1 = {0, 1, 0, 0, 0, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {0, 0, 0, 0, 1, 0, 0}; + std::vector ret_4 = {1, 0, 0, 0, 0, 0, 0}; + std::vector ret_5 = {0, 0, 1, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + auto expected_4 = make_bool_col_fn(ret_4); + auto expected_5 = make_bool_col_fn(ret_5); + + auto expected = + cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 19714a5afd0..6b2163aeda1 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -198,154 +197,6 @@ TEST_F(StringsFindTest, ContainsLongStrings) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } -TEST_F(StringsFindTest, MultiContains) -{ - constexpr int num_rows = 1024 + 1; - // replicate the following 9 rows: - std::vector s = { - "Héllo, there world and goodbye", - "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", - "the following code snippet demonstrates how to use search for values in an ordered range", - "it returns the last position where value could be inserted without violating the ordering", - "algorithms execution is parallelized as determined by an execution policy. t", - "he this is a continuation of previous row to make sure string boundaries are honored", - "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~", - "", - ""}; - - // replicate strings - auto string_itr = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); - - // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ...... - auto string_v = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) { return (i + 1) % s.size() != 0; }); - - auto const strings = - cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); - auto strings_view = cudf::strings_column_view(strings); - std::vector match_targets({" the ", "a", "", "é"}); - cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), - match_targets.end()); - auto results = - cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column)); - - std::vector ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0}; - std::vector ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0}; - std::vector ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0}; - std::vector ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0}; - - auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) { return bools[i % bools.size()]; }); - return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); - }; - - auto expected_0 = make_bool_col_fn(ret_0); - auto expected_1 = make_bool_col_fn(ret_1); - auto expected_2 = make_bool_col_fn(ret_2); - auto expected_3 = make_bool_col_fn(ret_3); - - auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); -} - -TEST_F(StringsFindTest, MultiContainsMoreTargets) -{ - auto const strings = cudf::test::strings_column_wrapper{ - "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " - "quick brown fox jumped", - "the following code snippet demonstrates how to use search for values in an ordered rangethe " - "following code snippet", - "thé it returns the last position where value could be inserted without violating ordering thé " - "it returns the last position"}; - auto strings_view = cudf::strings_column_view(strings); - std::vector targets({"lazy brown", "non-exist", ""}); - - std::vector> expects; - expects.push_back(cudf::test::fixed_width_column_wrapper({1, 0, 0})); - expects.push_back(cudf::test::fixed_width_column_wrapper({0, 0, 0})); - expects.push_back(cudf::test::fixed_width_column_wrapper({1, 1, 1})); - - std::vector match_targets; - int max_num_targets = 50; - - for (int num_targets = 1; num_targets < max_num_targets; num_targets++) { - match_targets.clear(); - for (int i = 0; i < num_targets; i++) { - match_targets.push_back(targets[i % targets.size()]); - } - - cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), - match_targets.end()); - auto results = cudf::strings::contains_multiple( - strings_view, cudf::strings_column_view(multi_targets_column)); - EXPECT_EQ(results->num_columns(), num_targets); - for (int i = 0; i < num_targets; i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]); - } - } -} - -TEST_F(StringsFindTest, MultiContainsLongStrings) -{ - constexpr int num_rows = 1024 + 1; - // replicate the following 7 rows: - std::vector s = { - "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " - "quick brown fox jumped", - "the following code snippet demonstrates how to use search for values in an ordered rangethe " - "following code snippet", - "thé it returns the last position where value could be inserted without violating ordering thé " - "it returns the last position", - "algorithms execution is parallelized as determined by an execution policy. t algorithms " - "execution is parallelized as ", - "he this is a continuation of previous row to make sure string boundaries are honored he this " - "is a continuation of previous row", - "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ " - "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL", - ""}; - - // replicate strings - auto string_itr = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); - - // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ...... - auto string_v = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) { return (i + 1) % s.size() != 0; }); - - auto const strings = - cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); - - auto sv = cudf::strings_column_view(strings); - auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "}); - auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets)); - - std::vector ret_0 = {1, 0, 1, 0, 0, 0, 0}; - std::vector ret_1 = {0, 1, 0, 0, 0, 0, 0}; - std::vector ret_2 = {1, 1, 1, 1, 1, 1, 0}; - std::vector ret_3 = {0, 0, 0, 0, 1, 0, 0}; - std::vector ret_4 = {1, 0, 0, 0, 0, 0, 0}; - std::vector ret_5 = {0, 0, 1, 0, 0, 0, 0}; - - auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) { return bools[i % bools.size()]; }); - return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); - }; - - auto expected_0 = make_bool_col_fn(ret_0); - auto expected_1 = make_bool_col_fn(ret_1); - auto expected_2 = make_bool_col_fn(ret_2); - auto expected_3 = make_bool_col_fn(ret_3); - auto expected_4 = make_bool_col_fn(ret_4); - auto expected_5 = make_bool_col_fn(ret_5); - - auto expected = - cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); -} - TEST_F(StringsFindTest, StartsWith) { cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""}, From 04717e3002f1faca071f82b683a77ceca2c6ff1e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Oct 2024 13:30:29 -0400 Subject: [PATCH 11/23] use output directly in row-parallel kernel --- cpp/src/strings/search/contains_multiple.cu | 60 ++++++++++++--------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index 7c63fbf0453..8acef1170ce 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -127,8 +127,12 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, // initialize result: set true if target is empty, false otherwise for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { auto const d_target = d_targets.element(target_idx); - auto const begin = bools + (target_idx * tile_size); - thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty()); + if constexpr (tile_size == 1) { + d_results[target_idx][str_idx] = d_target.empty(); + } else { + auto const begin = bools + (target_idx * tile_size); + thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty()); + } } if constexpr (tile_size == cudf::detail::warp_size) { __syncwarp(); } @@ -149,25 +153,31 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, while (map_idx < last_idx) { auto const target_idx = d_indices[map_idx++]; auto const bool_idx = (target_idx * tile_size) + lane_idx; - if (!bools[bool_idx]) { // not found before + auto const found = tile_size == 1 ? d_results[target_idx][str_idx] : bools[bool_idx]; + if (!found) { // not found before auto const d_target = d_targets.element(target_idx); if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { // first char already checked, so just check the [1, end) chars match auto const tp = d_target.data(); if (thrust::equal(thrust::seq, tp + 1, tp + d_target.size_bytes(), sptr + 1)) { - bools[bool_idx] = true; + if constexpr (tile_size == 1) { + d_results[target_idx][str_idx] = true; + } else { + bools[bool_idx] = true; + } } } } } } - if constexpr (tile_size == cudf::detail::warp_size) { __syncwarp(); } - - // reduce the bools for each target to store in the result - for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { - auto const begin = bools + (target_idx * tile_size); - d_results[target_idx][str_idx] = - thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity{}); + if constexpr (tile_size == cudf::detail::warp_size) { + __syncwarp(); + // reduce the bools for each target to store in the result + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { + auto const begin = bools + (target_idx * tile_size); + d_results[target_idx][str_idx] = + thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity{}); + } } } } // namespace @@ -242,30 +252,30 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, auto d_indices = indices.data(); auto d_offsets = offsets.data(); + bool const row_parallel = ((input.null_count() == input.size()) || + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)); + auto const shared_mem_size = - targets.size() < targets_threshold ? (block_size * targets.size()) : 0; + !row_parallel && (targets.size() < targets_threshold) ? (block_size * targets.size()) : 0; auto const work_mem_size = - targets.size() < targets_threshold + row_parallel || (targets.size() < targets_threshold) ? 0 : (static_cast(targets.size()) * input.size() * cudf::detail::warp_size); auto working_memory = rmm::device_uvector(work_mem_size, stream); - bool const row_parallel = ((input.null_count() == input.size()) || - ((input.chars_size(stream) / (input.size() - input.null_count())) <= - AVG_CHAR_BYTES_THRESHOLD)); if (row_parallel) { // Smaller strings perform better with a row per string cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; multi_contains_kernel<1> - <<>>( - *d_strings, - *d_targets, - d_first_bytes, - d_indices, - d_offsets, - unique_count, - working_memory.data(), - d_results); + <<>>(*d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + working_memory.data(), + d_results); } else { // Longer strings perform better with a warp per string cudf::detail::grid_1d grid{ From 62a51c69833b70b5f87a76e3653f0574aca506f9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 15 Oct 2024 08:54:17 -0400 Subject: [PATCH 12/23] Update doxygen, exceptions Co-authored-by: Chong Gao --- cpp/include/cudf/strings/find_multiple.hpp | 13 +++++++------ cpp/src/strings/search/contains_multiple.cu | 5 +++-- cpp/src/strings/search/find_multiple.cu | 5 +++-- cpp/tests/strings/find_multiple_tests.cpp | 6 ++++-- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 90ffd6e7df9..2173fc97526 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -28,8 +28,7 @@ namespace strings { */ /** - * @brief Returns a table of columns of boolean values for each string where true indicates - * the target string was found within that string in the provided column + * @brief Searches for the given target strings within each string in the provided column * * Each column in the result table corresponds to the result for the target string at the same * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th, @@ -38,7 +37,7 @@ namespace strings { * If the target is not found for a string, false is returned for that entry in the output column. * If the target is an empty string, true is returned for all non-null entries in the output column. * - * Any null string entries return corresponding null entries in the output columns. + * Any null input strings return corresponding null entries in the output columns. * * @code{.pseudo} * input = ["a", "b", "c"] @@ -48,6 +47,8 @@ namespace strings { * column 1: [false, false, true] * @endcode * + * @throw std::invalid_argument if `targets` is empty or contains nulls + * * @param input Strings instance for this operation * @param targets UTF-8 encoded strings to search for in each string in `input` * @param stream CUDA stream used for device memory operations and kernel launches @@ -61,8 +62,8 @@ std::unique_ptr
contains_multiple( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a lists column with character position values where each - * of the target strings are found in each string. + * @brief Searches for the given target strings within each string in the provided column + * and returns the position the targets were found * * The size of the output column is `input.size()`. * Each row of the output column is of size `targets.size()`. @@ -78,7 +79,7 @@ std::unique_ptr
contains_multiple( * [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1 * @endcode * - * @throw cudf::logic_error if `targets` is empty or contains nulls + * @throw , std::invalid_argument if `targets` is empty or contains nulls * * @param input Strings instance for this operation * @param targets Strings to search for in each string diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index 8acef1170ce..ee5519a87fa 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -187,8 +187,9 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string."); - CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null"); + CUDF_EXPECTS( + not targets.is_empty(), "Must specify at least one target string.", std::invalid_argument); + CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null", std::invalid_argument); auto const d_strings = column_device_view::create(input.parent(), stream); auto const d_targets = column_device_view::create(targets.parent(), stream); diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index ec7015878dd..67226b259d4 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -42,8 +42,9 @@ std::unique_ptr find_multiple(strings_column_view const& input, { auto const strings_count = input.size(); auto const targets_count = targets.size(); - CUDF_EXPECTS(targets_count > 0, "Must include at least one search target"); - CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings"); + CUDF_EXPECTS(targets_count > 0, "Must include at least one search target", std::invalid_argument); + CUDF_EXPECTS( + !targets.has_nulls(), "Search targets cannot contain null strings", std::invalid_argument); auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index af6c5800f4d..3c8483b153d 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -76,10 +76,12 @@ TEST_F(StringsFindMultipleTest, ErrorTest) auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); auto empty_view = cudf::strings_column_view(zero_size_strings_column); // targets must have at least one string - EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, empty_view), std::invalid_argument); // targets cannot have nulls - EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, strings_view), std::invalid_argument); } TEST_F(StringsFindMultipleTest, MultiContains) From ede8a7bdd56d84b42cc70c9984a0b85a841c0c72 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 17 Oct 2024 19:26:22 -0400 Subject: [PATCH 13/23] cleanup code part I --- cpp/include/cudf/strings/find_multiple.hpp | 2 +- cpp/src/strings/search/contains_multiple.cu | 34 ++++++++++++--------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 2173fc97526..6b9cc3723b3 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -53,7 +53,7 @@ namespace strings { * @param targets UTF-8 encoded strings to search for in each string in `input` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New BOOL8 column + * @return Table of BOOL8 columns */ std::unique_ptr
contains_multiple( strings_column_view const& input, diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index ee5519a87fa..cc91719915b 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -170,13 +170,15 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, } } } - if constexpr (tile_size == cudf::detail::warp_size) { + + if constexpr (tile_size > 1) { __syncwarp(); // reduce the bools for each target to store in the result for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { auto const begin = bools + (target_idx * tile_size); d_results[target_idx][str_idx] = thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity{}); + // cooperative group implementation was almost 3x slower than this parallel reduce } } } @@ -247,7 +249,8 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, }(); constexpr cudf::thread_index_type block_size = 256; - constexpr size_type targets_threshold = 32; // for shared-memory size + // calculated (benchmarked) for efficient use of shared-memory + constexpr size_type targets_threshold = 32; auto d_first_bytes = first_bytes.data(); auto d_indices = indices.data(); @@ -257,14 +260,6 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, ((input.chars_size(stream) / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)); - auto const shared_mem_size = - !row_parallel && (targets.size() < targets_threshold) ? (block_size * targets.size()) : 0; - auto const work_mem_size = - row_parallel || (targets.size() < targets_threshold) - ? 0 - : (static_cast(targets.size()) * input.size() * cudf::detail::warp_size); - auto working_memory = rmm::device_uvector(work_mem_size, stream); - if (row_parallel) { // Smaller strings perform better with a row per string cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; @@ -275,13 +270,22 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, d_indices, d_offsets, unique_count, - working_memory.data(), + nullptr, d_results); } else { - // Longer strings perform better with a warp per string - cudf::detail::grid_1d grid{ - static_cast(input.size()) * cudf::detail::warp_size, block_size}; - multi_contains_kernel + constexpr cudf::thread_index_type tile_size = cudf::detail::warp_size; + + auto const shared_mem_size = + (targets.size() < targets_threshold) ? (block_size * targets.size()) : 0; + auto const work_mem_size = + (targets.size() < targets_threshold) ? 0 : tile_size * targets.size() * input.size(); + auto working_memory = rmm::device_uvector(work_mem_size, stream); + + // std::cout << shared_mem_size << "," << work_mem_size << std::endl; + + cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, + block_size}; + multi_contains_kernel <<>>( *d_strings, *d_targets, From b5493ec331be0a56487fd75ea68dec1aa7f14527 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 22 Oct 2024 10:24:13 -0400 Subject: [PATCH 14/23] fix threshold check --- cpp/src/strings/search/contains_multiple.cu | 4 ++-- cpp/tests/strings/find_tests.cpp | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index cc91719915b..190881c9f34 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -81,8 +81,8 @@ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; * The logic now only needs to check for either of these 2 targets. * * This kernel works in either row-per-string or warp-per-string depending - * on the template parameter. If tile_size==1, then it executes as a - * row-per-string. If tile_size=32, the it executes as a warp-per-string. + * on the template parameter. If tile_size==1, then this kernel executes as + * a row-per-string. If tile_size=32, the it executes as a warp-per-string. * No other options are supported for now. * * @tparam tile_size Number of threads per string diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 6b2163aeda1..a3066c40650 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include From b6ac3ff983de8b0efa517ab3bde5df7f6e0fdc5c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 22 Oct 2024 12:49:56 -0400 Subject: [PATCH 15/23] fix threshold check for real --- cpp/src/strings/search/contains_multiple.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index 190881c9f34..ff995c53413 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -276,9 +276,9 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, constexpr cudf::thread_index_type tile_size = cudf::detail::warp_size; auto const shared_mem_size = - (targets.size() < targets_threshold) ? (block_size * targets.size()) : 0; + (targets.size() <= targets_threshold) ? (block_size * targets.size()) : 0; auto const work_mem_size = - (targets.size() < targets_threshold) ? 0 : tile_size * targets.size() * input.size(); + (targets.size() <= targets_threshold) ? 0 : tile_size * targets.size() * input.size(); auto working_memory = rmm::device_uvector(work_mem_size, stream); // std::cout << shared_mem_size << "," << work_mem_size << std::endl; From b038e0c13cdc63711d8d802b65c6f6e853a5324b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 23 Oct 2024 15:14:31 -0400 Subject: [PATCH 16/23] remove commented out debug print --- cpp/include/cudf/strings/find_multiple.hpp | 2 +- cpp/src/strings/search/contains_multiple.cu | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 6b9cc3723b3..03a74166256 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -79,7 +79,7 @@ std::unique_ptr
contains_multiple( * [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1 * @endcode * - * @throw , std::invalid_argument if `targets` is empty or contains nulls + * @throw std::invalid_argument if `targets` is empty or contains nulls * * @param input Strings instance for this operation * @param targets Strings to search for in each string diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index ff995c53413..e4468f4df9a 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -281,8 +281,6 @@ std::unique_ptr
contains_multiple(strings_column_view const& input, (targets.size() <= targets_threshold) ? 0 : tile_size * targets.size() * input.size(); auto working_memory = rmm::device_uvector(work_mem_size, stream); - // std::cout << shared_mem_size << "," << work_mem_size << std::endl; - cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, block_size}; multi_contains_kernel From 3c5cd47c92d01dbc252d856741d1592c8ac1ab9f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 31 Oct 2024 10:15:37 -0400 Subject: [PATCH 17/23] fix call to build benchmark input col --- cpp/benchmarks/string/find_multiple.cpp | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp index f3115677fa4..514c6a115c3 100644 --- a/cpp/benchmarks/string/find_multiple.cpp +++ b/cpp/benchmarks/string/find_multiple.cpp @@ -26,10 +26,6 @@ #include -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate); - static void bench_find_string(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); @@ -38,16 +34,11 @@ static void bench_find_string(nvbench::state& state) auto const target_count = static_cast(state.get_int64("targets")); auto const api = state.get_string("api"); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto const stream = cudf::get_default_stream(); - auto const col = build_input_column(n_rows, row_width, hit_rate); + auto const col = create_string_column(n_rows, row_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - // Note that these all match the first row of the raw_data in build_input_column. + // Note that these all match the first row of the raw_data in create_string_column. // This is so the hit_rate can properly accounted for. std::vector target_data( {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"}); @@ -81,6 +72,6 @@ NVBENCH_BENCH(bench_find_string) .set_name("find_multiple") .add_string_axis("api", {"find", "contains"}) .add_int64_axis("targets", {10, 20, 40}) - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {20, 80}); // percentage From 9910bd35154864c1242d53c17a0855b43be7004f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 5 Nov 2024 10:23:56 -0500 Subject: [PATCH 18/23] fix copyright year --- cpp/benchmarks/string/find_multiple.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp index 514c6a115c3..0e780fdb302 100644 --- a/cpp/benchmarks/string/find_multiple.cpp +++ b/cpp/benchmarks/string/find_multiple.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,7 +40,7 @@ static void bench_find_string(nvbench::state& state) // Note that these all match the first row of the raw_data in create_string_column. // This is so the hit_rate can properly accounted for. - std::vector target_data( + std::vector const target_data( {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"}); auto h_targets = std::vector{}; for (cudf::size_type i = 0; i < target_count; i++) { From 31d5a4e13daa7a434396114d7dd3ba3f5da60880 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 5 Nov 2024 13:11:36 -0500 Subject: [PATCH 19/23] use cooperative groups for tile-size --- cpp/src/strings/search/contains_multiple.cu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index e4468f4df9a..5f01f52e60d 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -113,7 +114,9 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, // get the string for this tile auto const d_str = d_strings.element(str_idx); - auto const lane_idx = idx % tile_size; + namespace cg = cooperative_groups; + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const lane_idx = tile.thread_rank(); auto const tile_idx = static_cast(threadIdx.x) / tile_size; auto const num_targets = d_targets.size(); @@ -134,7 +137,7 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty()); } } - if constexpr (tile_size == cudf::detail::warp_size) { __syncwarp(); } + tile.sync(); auto const last_ptr = d_first_bytes + unique_count; for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes(); @@ -178,7 +181,7 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, auto const begin = bools + (target_idx * tile_size); d_results[target_idx][str_idx] = thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity{}); - // cooperative group implementation was almost 3x slower than this parallel reduce + // cooperative_group any() implementation was almost 3x slower than this parallel reduce } } } From 3b5576bac048d65b063fc7f14e7f27f14e365cb7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 5 Nov 2024 13:37:49 -0500 Subject: [PATCH 20/23] replace syncwarp with tile.sync --- cpp/src/strings/search/contains_multiple.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index 5f01f52e60d..66a02a14909 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -175,7 +175,7 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, } if constexpr (tile_size > 1) { - __syncwarp(); + tile.sync(); // reduce the bools for each target to store in the result for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { auto const begin = bools + (target_idx * tile_size); From c0e96dc25122e93b0ea28b0106c9441701f59969 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 5 Nov 2024 13:50:40 -0500 Subject: [PATCH 21/23] use meta_group_rank --- cpp/src/strings/search/contains_multiple.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index 66a02a14909..ceebdc5b8fa 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -117,15 +117,15 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, namespace cg = cooperative_groups; auto const tile = cg::tiled_partition(cg::this_thread_block()); auto const lane_idx = tile.thread_rank(); - auto const tile_idx = static_cast(threadIdx.x) / tile_size; auto const num_targets = d_targets.size(); // size of shared_bools = num_targets * block_size // each thread uses num_targets bools extern __shared__ bool shared_bools[]; // bools for the current string - auto bools = working_memory == nullptr ? (shared_bools + (tile_idx * tile_size * num_targets)) - : (working_memory + (str_idx * tile_size * num_targets)); + auto bools = working_memory == nullptr + ? (shared_bools + (tile.meta_group_rank() * tile_size * num_targets)) + : (working_memory + (str_idx * tile_size * num_targets)); // initialize result: set true if target is empty, false otherwise for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { From 7031dac62f7b088e4a4cb84b1be5ec6d329707b8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 8 Nov 2024 14:36:08 -0500 Subject: [PATCH 22/23] fix comment wording --- cpp/src/strings/search/contains_multiple.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu index ceebdc5b8fa..1183e3e4038 100644 --- a/cpp/src/strings/search/contains_multiple.cu +++ b/cpp/src/strings/search/contains_multiple.cu @@ -81,7 +81,7 @@ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; * - tgt2_idx = d_indices[map_idx+1] = 1 --> d_targets[1] == 'hello' * The logic now only needs to check for either of these 2 targets. * - * This kernel works in either row-per-string or warp-per-string depending + * This kernel works in either thread-per-string or warp-per-string depending * on the template parameter. If tile_size==1, then this kernel executes as * a row-per-string. If tile_size=32, the it executes as a warp-per-string. * No other options are supported for now. From 4ef6d46a69f4d3ab8e88253b2ba4e75a423d0ea6 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 12 Nov 2024 09:19:52 -0600 Subject: [PATCH 23/23] 1st instead of 1th --- cpp/include/cudf/strings/find_multiple.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 03a74166256..e090766dd07 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -31,7 +31,7 @@ namespace strings { * @brief Searches for the given target strings within each string in the provided column * * Each column in the result table corresponds to the result for the target string at the same - * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th, + * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1st for 1st, * etc. * * If the target is not found for a string, false is returned for that entry in the output column.