From e446371bf16373590043474df5c023e610ff012c Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 24 Sep 2024 16:28:20 -0400
Subject: [PATCH 01/23] Add cudf::strings::contains_multiple

---
 cpp/CMakeLists.txt                          |   1 +
 cpp/benchmarks/string/find.cpp              |  25 +-
 cpp/include/cudf/strings/find.hpp           |  33 ++
 cpp/src/strings/search/contains_multiple.cu | 330 ++++++++++++++++++++
 cpp/tests/strings/find_tests.cpp            | 153 ++++++++-
 5 files changed, 538 insertions(+), 4 deletions(-)
 create mode 100644 cpp/src/strings/search/contains_multiple.cu
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 84b462bb884..38ee5389cf2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -603,6 +603,7 @@ add_library(
   src/strings/replace/replace_slice.cu
   src/strings/reverse.cu
   src/strings/scan/scan_inclusive.cu
+  src/strings/search/contains_multiple.cu
   src/strings/search/findall.cu
   src/strings/search/find.cu
   src/strings/search/find_multiple.cu
diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp
index a9c620e4bf0..baa34b13e2e 100644
--- a/cpp/benchmarks/string/find.cpp
+++ b/cpp/benchmarks/string/find.cpp
@@ -73,6 +73,28 @@ static void bench_find_string(nvbench::state& state)
   } else if (api == "contains") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { cudf::strings::contains(input, target); });
+  } else if (api == "contains_multi") {
+    constexpr int iters = 10;
+    std::vector<std::string> match_targets({" abc",
+                                            "W43",
+                                            "0987 5W43",
+                                            "123 abc",
+                                            "23 abc",
+                                            "3 abc",
+                                            "é",
+                                            "7 5W43",
+                                            "87 5W43",
+                                            "987 5W43"});
+    auto multi_targets = std::vector<std::string>{};
+    for (int i = 0; i < iters; i++) {
+      multi_targets.emplace_back(match_targets[i % match_targets.size()]);
+    }
+    cudf::test::strings_column_wrapper multi_targets_column(multi_targets.begin(),
+                                                            multi_targets.end());
+
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::contains_multiple(input, cudf::strings_column_view(multi_targets_column));
+    });
   } else if (api == "starts_with") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); });
@@ -84,7 +106,8 @@ static void bench_find_string(nvbench::state& state)
 
 NVBENCH_BENCH(bench_find_string)
   .set_name("find_string")
-  .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"})
+  .add_string_axis("api",
+                   {"find", "find_multi", "contains", "contains_multi", "starts_with", "ends_with"})
   .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
   .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
   .add_int64_axis("hit_rate", {20, 80});  // percentage
diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp
index e024b116a71..497c263cca8 100644
--- a/cpp/include/cudf/strings/find.hpp
+++ b/cpp/include/cudf/strings/find.hpp
@@ -163,6 +163,39 @@ std::unique_ptr<column> contains(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns a table of columns of boolean values for each string where true indicates
+ * the target string was found within that string in the provided column
+ *
+ * Each column in the result table corresponds to the result for the target string at the same
+ * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th,
+ * etc.
+ *
+ * If the target is not found for a string, false is returned for that entry in the output column.
+ * If the target is an empty string, true is returned for all non-null entries in the output column.
+ *
+ * Any null string entries return corresponding null entries in the output columns.
+ *
+ * @code{.pseudo}
+ * input = ["a", "b", "c"]
+ * targets = ["a", "c"]
+ * output is a table with two boolean columns:
+ *   column 0: [true, false, false]
+ *   column 1: [false, false, true]
+ * @endcode
+ *
+ * @param input Strings instance for this operation
+ * @param targets UTF-8 encoded strings to search for in each string in `input`
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New BOOL8 column
+ */
+std::unique_ptr<table> contains_multiple(
+  strings_column_view const& input,
+  strings_column_view const& targets,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Returns a column of boolean values for each string where true indicates
  * the target string was found at the beginning of that string in the provided column.
diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
new file mode 100644
index 00000000000..e4673016e28
--- /dev/null
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/copy.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/find.hpp>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cub/cub.cuh>
+#include <thrust/binary_search.h>
+#include <thrust/fill.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/unique.h>
+
+#include <algorithm>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+namespace {
+
+/**
+ * @brief Threshold to decide on using string or warp parallel functions.
+ *
+ * If the average byte length of a string in a column exceeds this value then
+ * a warp-parallel function is used.
+ */
+constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64;
+
+CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings,
+                                              column_device_view const d_targets,
+                                              u_char const* d_first_bytes,
+                                              size_type const* d_indices,
+                                              size_type const* d_offsets,
+                                              size_type unique_count,
+                                              cudf::device_span<bool*> d_results)
+{
+  auto const num_targets = d_targets.size();
+  auto const num_rows    = d_strings.size();
+  auto const idx         = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx     = idx / cudf::detail::warp_size;
+  if (str_idx >= num_rows) { return; }
+  if (d_strings.is_null(str_idx)) { return; }
+  // get the string for this warp
+  auto const d_str = d_strings.element<string_view>(str_idx);
+
+  // size of shared_bools = targets_size * block_size
+  // each thread uses targets_size bools
+  extern __shared__ bool shared_bools[];
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  // initialize result: set true if target is empty, false otherwise
+  for (int target_idx = 0; target_idx < num_targets; target_idx++) {
+    auto const d_target = d_targets.element<string_view>(target_idx);
+    shared_bools[threadIdx.x * num_targets + target_idx] = d_target.empty();
+  }
+
+  auto const last_ptr = d_first_bytes + unique_count;
+  for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes();
+       str_byte_idx += cudf::detail::warp_size) {
+    // search for byte in first_bytes array
+    auto const chr      = static_cast<u_char>(*(d_str.data() + str_byte_idx));
+    auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr);
+    // if not found, continue to next byte
+    if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
+    // compute index of matched byte
+    auto offset_idx = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
+    auto map_idx    = d_offsets[offset_idx];
+    auto const last_idx =
+      (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : unique_count;
+    // check for targets that begin with chr
+    while ((map_idx < num_targets) && (offset_idx < last_idx)) {
+      auto target_idx     = d_indices[map_idx++];
+      int temp_result_idx = threadIdx.x * num_targets + target_idx;
+      if (!shared_bools[temp_result_idx]) {  // not found before
+        auto const d_target = d_targets.element<string_view>(target_idx);
+        if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) {
+          // first char already checked, only need to check the [2nd, end) chars if has.
+          bool found = true;
+          for (auto i = 1; i < d_target.size_bytes() && found; i++) {
+            if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; }
+          }
+          if (found) { shared_bools[temp_result_idx] = true; }
+        }
+      }
+      ++offset_idx;
+    }
+  }
+
+  // wait all lanes are done in a warp
+  __syncwarp();
+
+  if (lane_idx == 0) {
+    for (int target_idx = 0; target_idx < num_targets; target_idx++) {
+      bool found = false;
+      for (int lane_idx = 0; lane_idx < cudf::detail::warp_size; lane_idx++) {
+        int temp_idx = (threadIdx.x + lane_idx) * num_targets + target_idx;
+        if (shared_bools[temp_idx]) {
+          found = true;
+          break;
+        }
+      }
+      d_results[target_idx][str_idx] = found;
+    }
+  }
+}
+
+CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings,
+                                             column_device_view const d_targets,
+                                             u_char const* d_first_bytes,
+                                             size_type const* d_indices,
+                                             size_type const* d_offsets,
+                                             size_type unique_count,
+                                             cudf::device_span<bool*> d_results)
+{
+  auto const str_idx     = static_cast<size_type>(cudf::detail::grid_1d::global_thread_id());
+  auto const num_targets = d_targets.size();
+  auto const num_rows    = d_strings.size();
+  if (str_idx >= num_rows) { return; }
+  if (d_strings.is_null(str_idx)) { return; }
+  auto const d_str = d_strings.element<string_view>(str_idx);
+
+  // initialize output; the result of searching empty target is true
+  for (auto target_idx = 0; target_idx < num_targets; ++target_idx) {
+    auto const d_target            = d_targets.element<string_view>(target_idx);
+    d_results[target_idx][str_idx] = d_target.empty();
+  }
+
+  // process each byte of the current string
+  auto const last_ptr = d_first_bytes + unique_count;
+  for (auto str_byte_idx = 0; str_byte_idx < d_str.size_bytes(); ++str_byte_idx) {
+    // search for byte in first_bytes array
+    auto const chr      = static_cast<u_char>(*(d_str.data() + str_byte_idx));
+    auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr);
+    // if not found, continue to next byte
+    if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
+    // compute index of matched byte
+    auto offset_idx = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
+    auto map_idx    = d_offsets[offset_idx];
+    auto const last_idx =
+      (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : unique_count;
+    // check for targets that begin with chr
+    while ((map_idx < num_targets) && (offset_idx < last_idx)) {
+      auto target_idx = d_indices[map_idx++];
+      if (!d_results[target_idx][str_idx]) {  // not found before
+        auto const d_target = d_targets.element<string_view>(target_idx);
+        if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) {
+          // first char already checked, only need to check the [2nd, end) chars
+          bool found = true;
+          for (auto i = 1; i < d_target.size_bytes() && found; i++) {
+            if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; }
+          }
+          if (found) { d_results[target_idx][str_idx] = true; }
+        }
+      }
+      ++offset_idx;
+    }
+  }
+}
+
+std::vector<std::unique_ptr<column>> multi_contains(bool warp_parallel,
+                                                    strings_column_view const& input,
+                                                    strings_column_view const& targets,
+                                                    rmm::cuda_stream_view stream,
+                                                    rmm::mr::device_memory_resource* mr)
+{
+  auto const num_targets = static_cast<size_type>(targets.size());
+
+  auto const d_strings = column_device_view::create(input.parent(), stream);
+  auto const d_targets = column_device_view::create(targets.parent(), stream);
+
+  // copy the first byte of each target and sort the first bytes
+  auto first_bytes = rmm::device_uvector<u_char>(targets.size(), stream);
+  auto indices     = rmm::device_uvector<size_type>(targets.size(), stream);
+  {
+    auto tgt_itr = thrust::make_transform_iterator(
+      d_targets->begin<string_view>(), [] __device__(auto const& d_tgt) -> u_char {
+        return d_tgt.empty() ? u_char{0} : static_cast<u_char>(d_tgt.data()[0]);
+      });
+    auto count_itr = thrust::make_counting_iterator<size_type>(0);
+    auto keys_out  = first_bytes.begin();
+    auto vals_out  = indices.begin();
+    auto cmp_op    = thrust::less();
+    auto sv        = stream.value();
+
+    std::size_t tmp_bytes = 0;
+    cub::DeviceMergeSort::SortPairsCopy(
+      nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_targets, cmp_op, sv);
+    auto tmp_stg = rmm::device_buffer(tmp_bytes, stream);
+    cub::DeviceMergeSort::SortPairsCopy(
+      tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_targets, cmp_op, sv);
+  }
+
+  // remove duplicates to speed up lower_bound
+  auto offsets = rmm::device_uvector<size_type>(targets.size(), stream);
+  thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end());
+  auto end = thrust::unique_by_key(
+    rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin());
+  auto ucount = static_cast<size_type>(thrust::distance(first_bytes.begin(), end.first));
+
+  // create output columns
+  auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) {
+    return make_numeric_column(data_type{type_id::BOOL8},
+                               input.size(),
+                               cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                               input.null_count(),
+                               stream,
+                               mr);
+  });
+  auto results_list =
+    std::vector<std::unique_ptr<column>>(results_iter, results_iter + targets.size());
+  auto device_results_list = [&] {
+    auto host_results_pointer_iter =
+      thrust::make_transform_iterator(results_list.begin(), [](auto const& results_column) {
+        return results_column->mutable_view().template data<bool>();
+      });
+    auto host_results_pointers = std::vector<bool*>(
+      host_results_pointer_iter, host_results_pointer_iter + results_list.size());
+    return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr);
+  }();
+
+  constexpr cudf::thread_index_type block_size = 256;
+
+  auto d_first_bytes = first_bytes.data();
+  auto d_indices     = indices.data();
+  auto d_offsets     = offsets.data();
+
+  if (warp_parallel) {
+    cudf::detail::grid_1d grid{
+      static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
+    int shared_mem_size = block_size * targets.size();
+    multi_contains_warp_parallel<<<grid.num_blocks,
+                                   grid.num_threads_per_block,
+                                   shared_mem_size,
+                                   stream.value()>>>(
+      *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list);
+  } else {
+    cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()), block_size};
+    multi_contains_row_parallel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list);
+  }
+
+  return results_list;
+}
+
+}  // namespace
+
+std::unique_ptr<table> contains_multiple(strings_column_view const& input,
+                                         strings_column_view const& targets,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string.");
+  CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null");
+
+  if ((input.null_count() == input.size()) ||
+      ((input.chars_size(stream) / (input.size() - input.null_count())) <=
+       AVG_CHAR_BYTES_THRESHOLD)) {
+    // Small strings. Searching for multiple targets in one thread seems to work fastest.
+    return std::make_unique<table>(
+      multi_contains(/**warp parallel**/ false, input, targets, stream, mr));
+  }
+
+  // Long strings
+  // Use warp parallel when the average string width is greater than the threshold
+  static constexpr size_type target_group_size = 16;  // perhaps can be calculated
+  if (targets.size() <= target_group_size) {
+    return std::make_unique<table>(
+      multi_contains(/**warp parallel**/ true, input, targets, stream, mr));
+  }
+
+  // Too many targets will consume more shared memory, so split targets
+  // TODO: test with large working memory (instead of shared-memory)
+  std::vector<std::unique_ptr<column>> ret_columns;
+  auto const num_groups = cudf::util::div_rounding_up_safe(targets.size(), target_group_size);
+  for (size_type group_idx = 0; group_idx < num_groups; group_idx++) {
+    auto const start_target = group_idx * target_group_size;
+    auto const end_target   = std::min(start_target + target_group_size, targets.size());
+
+    auto target_group = cudf::detail::slice(targets.parent(), start_target, end_target, stream);
+    auto bool_columns = multi_contains(
+      /**warp parallel**/ true, input, strings_column_view(target_group), stream, mr);
+    for (auto& c : bool_columns) {
+      ret_columns.push_back(std::move(c));  // transfer ownership
+    }
+  }
+  return std::make_unique<table>(std::move(ret_columns));
+}
+
+}  // namespace detail
+
+std::unique_ptr<table> contains_multiple(strings_column_view const& strings,
+                                         strings_column_view const& targets,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::contains_multiple(strings, targets, stream, mr);
+}
+
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp
index 2da95ba5c27..698fea50bad 100644
--- a/cpp/tests/strings/find_tests.cpp
+++ b/cpp/tests/strings/find_tests.cpp
@@ -17,16 +17,15 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
 
-#include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/attributes.hpp>
 #include <cudf/strings/find.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
-#include <thrust/iterator/transform_iterator.h>
-
 #include <vector>
 
 struct StringsFindTest : public cudf::test::BaseFixture {};
@@ -198,6 +197,154 @@ TEST_F(StringsFindTest, ContainsLongStrings)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
+TEST_F(StringsFindTest, MultiContains)
+{
+  constexpr int num_rows = 1024 + 1;
+  // replicate the following 9 rows:
+  std::vector<std::string> s = {
+    "Héllo, there world and goodbye",
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving",
+    "the following code snippet demonstrates how to use search for values in an ordered range",
+    "it returns the last position where value could be inserted without violating the ordering",
+    "algorithms execution is parallelized as determined by an execution policy. t",
+    "he this is a continuation of previous row to make sure string boundaries are honored",
+    "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~",
+    "",
+    ""};
+
+  // replicate strings
+  auto string_itr =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; });
+
+  // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ......
+  auto string_v = cudf::detail::make_counting_transform_iterator(
+    0, [&](auto i) { return (i + 1) % s.size() != 0; });
+
+  auto const strings =
+    cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v);
+  auto strings_view = cudf::strings_column_view(strings);
+  std::vector<std::string> match_targets({" the ", "a", "", "é"});
+  cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(),
+                                                          match_targets.end());
+  auto results =
+    cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column));
+
+  std::vector<bool> ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0};
+  std::vector<bool> ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0};
+  std::vector<bool> ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0};
+  std::vector<bool> ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  auto make_bool_col_fn = [&string_v, &num_rows](std::vector<bool> bools) {
+    auto iter = cudf::detail::make_counting_transform_iterator(
+      0, [&](auto i) { return bools[i % bools.size()]; });
+    return cudf::test::fixed_width_column_wrapper<bool>(iter, iter + num_rows, string_v);
+  };
+
+  auto expected_0 = make_bool_col_fn(ret_0);
+  auto expected_1 = make_bool_col_fn(ret_1);
+  auto expected_2 = make_bool_col_fn(ret_2);
+  auto expected_3 = make_bool_col_fn(ret_3);
+
+  auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected);
+}
+
+TEST_F(StringsFindTest, MultiContainsMoreTargets)
+{
+  auto const strings = cudf::test::strings_column_wrapper{
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving "
+    "quick brown fox jumped",
+    "the following code snippet demonstrates how to use search for values in an ordered rangethe "
+    "following code snippet",
+    "thé it returns the last position where value could be inserted without violating ordering thé "
+    "it returns the last position"};
+  auto strings_view = cudf::strings_column_view(strings);
+  std::vector<std::string> targets({"lazy brown", "non-exist", ""});
+
+  std::vector<cudf::test::fixed_width_column_wrapper<bool>> expects;
+  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({1, 0, 0}));
+  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0}));
+  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1}));
+
+  std::vector<std::string> match_targets;
+  int max_num_targets = 50;
+
+  for (int num_targets = 1; num_targets < max_num_targets; num_targets++) {
+    match_targets.clear();
+    for (int i = 0; i < num_targets; i++) {
+      match_targets.push_back(targets[i % targets.size()]);
+    }
+
+    cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(),
+                                                            match_targets.end());
+    auto results = cudf::strings::contains_multiple(
+      strings_view, cudf::strings_column_view(multi_targets_column));
+    EXPECT_EQ(results->num_columns(), num_targets);
+    for (int i = 0; i < num_targets; i++) {
+      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]);
+    }
+  }
+}
+
+TEST_F(StringsFindTest, MultiContainsLongStrings)
+{
+  constexpr int num_rows = 1024 + 1;
+  // replicate the following 7 rows:
+  std::vector<std::string> s = {
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving "
+    "quick brown fox jumped",
+    "the following code snippet demonstrates how to use search for values in an ordered rangethe "
+    "following code snippet",
+    "thé it returns the last position where value could be inserted without violating ordering thé "
+    "it returns the last position",
+    "algorithms execution is parallelized as determined by an execution policy. t algorithms "
+    "execution is parallelized as ",
+    "he this is a continuation of previous row to make sure string boundaries are honored he this "
+    "is a continuation of previous row",
+    "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ "
+    "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL",
+    ""};
+
+  // replicate strings
+  auto string_itr =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; });
+
+  // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ......
+  auto string_v = cudf::detail::make_counting_transform_iterator(
+    0, [&](auto i) { return (i + 1) % s.size() != 0; });
+
+  auto const strings =
+    cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v);
+
+  auto sv      = cudf::strings_column_view(strings);
+  auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "});
+  auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets));
+
+  std::vector<bool> ret_0 = {1, 0, 1, 0, 0, 0, 0};
+  std::vector<bool> ret_1 = {0, 1, 0, 0, 0, 0, 0};
+  std::vector<bool> ret_2 = {1, 1, 1, 1, 1, 1, 0};
+  std::vector<bool> ret_3 = {0, 0, 0, 0, 1, 0, 0};
+  std::vector<bool> ret_4 = {1, 0, 0, 0, 0, 0, 0};
+  std::vector<bool> ret_5 = {0, 0, 1, 0, 0, 0, 0};
+
+  auto make_bool_col_fn = [&string_v, &num_rows](std::vector<bool> bools) {
+    auto iter = cudf::detail::make_counting_transform_iterator(
+      0, [&](auto i) { return bools[i % bools.size()]; });
+    return cudf::test::fixed_width_column_wrapper<bool>(iter, iter + num_rows, string_v);
+  };
+
+  auto expected_0 = make_bool_col_fn(ret_0);
+  auto expected_1 = make_bool_col_fn(ret_1);
+  auto expected_2 = make_bool_col_fn(ret_2);
+  auto expected_3 = make_bool_col_fn(ret_3);
+  auto expected_4 = make_bool_col_fn(ret_4);
+  auto expected_5 = make_bool_col_fn(ret_5);
+
+  auto expected =
+    cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected);
+}
+
 TEST_F(StringsFindTest, StartsWith)
 {
   cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""},

From 779ca646fe7c43207d935ed96d59f133def85cad Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 24 Sep 2024 16:55:04 -0400
Subject: [PATCH 02/23] add proclaim_return_type

---
 cpp/src/strings/search/contains_multiple.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index e4673016e28..c8204ff5f49 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -33,6 +33,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cub/cub.cuh>
+#include <cuda/functional>
 #include <thrust/binary_search.h>
 #include <thrust/fill.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -202,9 +203,10 @@ std::vector<std::unique_ptr<column>> multi_contains(bool warp_parallel,
   auto indices     = rmm::device_uvector<size_type>(targets.size(), stream);
   {
     auto tgt_itr = thrust::make_transform_iterator(
-      d_targets->begin<string_view>(), [] __device__(auto const& d_tgt) -> u_char {
+      d_targets->begin<string_view>(),
+      cuda::proclaim_return_type<u_char>([] __device__(auto const& d_tgt) -> u_char {
         return d_tgt.empty() ? u_char{0} : static_cast<u_char>(d_tgt.data()[0]);
-      });
+      }));
     auto count_itr = thrust::make_counting_iterator<size_type>(0);
     auto keys_out  = first_bytes.begin();
     auto vals_out  = indices.begin();

From f19cb1c683facfc92a8f8dcd4ba267661de27bca Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 24 Sep 2024 17:41:46 -0400
Subject: [PATCH 03/23] fix while-loop check

---
 cpp/src/strings/search/contains_multiple.cu | 32 +++++++++------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index c8204ff5f49..ae05f75389f 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -65,10 +65,9 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
                                               cudf::device_span<bool*> d_results)
 {
   auto const num_targets = d_targets.size();
-  auto const num_rows    = d_strings.size();
   auto const idx         = cudf::detail::grid_1d::global_thread_id();
   auto const str_idx     = idx / cudf::detail::warp_size;
-  if (str_idx >= num_rows) { return; }
+  if (str_idx >= d_strings.size()) { return; }
   if (d_strings.is_null(str_idx)) { return; }
   // get the string for this warp
   auto const d_str = d_strings.element<string_view>(str_idx);
@@ -93,14 +92,13 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
     // if not found, continue to next byte
     if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
     // compute index of matched byte
-    auto offset_idx = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
-    auto map_idx    = d_offsets[offset_idx];
-    auto const last_idx =
-      (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : unique_count;
+    auto offset_idx     = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
+    auto map_idx        = d_offsets[offset_idx];
+    auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets;
     // check for targets that begin with chr
-    while ((map_idx < num_targets) && (offset_idx < last_idx)) {
-      auto target_idx     = d_indices[map_idx++];
-      int temp_result_idx = threadIdx.x * num_targets + target_idx;
+    while (map_idx < last_idx) {
+      auto const target_idx      = d_indices[map_idx++];
+      auto const temp_result_idx = (threadIdx.x * num_targets) + target_idx;
       if (!shared_bools[temp_result_idx]) {  // not found before
         auto const d_target = d_targets.element<string_view>(target_idx);
         if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) {
@@ -112,7 +110,6 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
           if (found) { shared_bools[temp_result_idx] = true; }
         }
       }
-      ++offset_idx;
     }
   }
 
@@ -144,8 +141,7 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings,
 {
   auto const str_idx     = static_cast<size_type>(cudf::detail::grid_1d::global_thread_id());
   auto const num_targets = d_targets.size();
-  auto const num_rows    = d_strings.size();
-  if (str_idx >= num_rows) { return; }
+  if (str_idx >= d_strings.size()) { return; }
   if (d_strings.is_null(str_idx)) { return; }
   auto const d_str = d_strings.element<string_view>(str_idx);
 
@@ -164,13 +160,12 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings,
     // if not found, continue to next byte
     if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
     // compute index of matched byte
-    auto offset_idx = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
-    auto map_idx    = d_offsets[offset_idx];
-    auto const last_idx =
-      (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : unique_count;
+    auto offset_idx     = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
+    auto map_idx        = d_offsets[offset_idx];
+    auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets;
     // check for targets that begin with chr
-    while ((map_idx < num_targets) && (offset_idx < last_idx)) {
-      auto target_idx = d_indices[map_idx++];
+    while (map_idx < last_idx) {
+      auto const target_idx = d_indices[map_idx++];
       if (!d_results[target_idx][str_idx]) {  // not found before
         auto const d_target = d_targets.element<string_view>(target_idx);
         if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) {
@@ -182,7 +177,6 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings,
           if (found) { d_results[target_idx][str_idx] = true; }
         }
       }
-      ++offset_idx;
     }
   }
 }

From 65e21d3f05c48d87e85711e884fe12630fd008d7 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 26 Sep 2024 11:54:05 -0400
Subject: [PATCH 04/23] cleanup code

---
 cpp/src/strings/search/contains_multiple.cu | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index ae05f75389f..9e6ba688c2f 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -101,7 +101,7 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
       auto const temp_result_idx = (threadIdx.x * num_targets) + target_idx;
       if (!shared_bools[temp_result_idx]) {  // not found before
         auto const d_target = d_targets.element<string_view>(target_idx);
-        if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) {
+        if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) {
           // first char already checked, only need to check the [2nd, end) chars if has.
           bool found = true;
           for (auto i = 1; i < d_target.size_bytes() && found; i++) {
@@ -119,12 +119,10 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
   if (lane_idx == 0) {
     for (int target_idx = 0; target_idx < num_targets; target_idx++) {
       bool found = false;
-      for (int lane_idx = 0; lane_idx < cudf::detail::warp_size; lane_idx++) {
-        int temp_idx = (threadIdx.x + lane_idx) * num_targets + target_idx;
-        if (shared_bools[temp_idx]) {
-          found = true;
-          break;
-        }
+      // use thrust::any() algorithm with strided iterator?
+      for (size_type lidx = 0; lidx < cudf::detail::warp_size && !found; lidx++) {
+        auto const temp_idx = ((threadIdx.x + lidx) * num_targets) + target_idx;
+        if (shared_bools[temp_idx]) { found = true; }
       }
       d_results[target_idx][str_idx] = found;
     }
@@ -168,7 +166,7 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings,
       auto const target_idx = d_indices[map_idx++];
       if (!d_results[target_idx][str_idx]) {  // not found before
         auto const d_target = d_targets.element<string_view>(target_idx);
-        if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) {
+        if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) {
           // first char already checked, only need to check the [2nd, end) chars
           bool found = true;
           for (auto i = 1; i < d_target.size_bytes() && found; i++) {

From b9da939a5dff69adcf1e35ee992e0d15134e538d Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 26 Sep 2024 17:21:01 -0400
Subject: [PATCH 05/23] change shared memory layout

---
 cpp/src/strings/search/contains_multiple.cu | 48 +++++++++++----------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index 9e6ba688c2f..f154fb0a89c 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -38,6 +38,7 @@
 #include <thrust/fill.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
+#include <thrust/logical.h>
 #include <thrust/sequence.h>
 #include <thrust/unique.h>
 
@@ -72,15 +73,21 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
   // get the string for this warp
   auto const d_str = d_strings.element<string_view>(str_idx);
 
-  // size of shared_bools = targets_size * block_size
-  // each thread uses targets_size bools
+  // size of shared_bools = num_targets * block_size
+  // each thread uses num_targets bools
   extern __shared__ bool shared_bools[];
   auto const lane_idx = idx % cudf::detail::warp_size;
+  auto const warp_idx = threadIdx.x / cudf::detail::warp_size;
+  // bools for the current string
+  auto bools = shared_bools + (warp_idx * cudf::detail::warp_size * num_targets);
 
   // initialize result: set true if target is empty, false otherwise
-  for (int target_idx = 0; target_idx < num_targets; target_idx++) {
+  for (auto target_idx = lane_idx; target_idx < num_targets;
+       target_idx += cudf::detail::warp_size) {
     auto const d_target = d_targets.element<string_view>(target_idx);
-    shared_bools[threadIdx.x * num_targets + target_idx] = d_target.empty();
+    auto const begin    = bools + (target_idx * cudf::detail::warp_size);
+    thrust::uninitialized_fill(
+      thrust::seq, begin, begin + cudf::detail::warp_size, d_target.empty());
   }
 
   auto const last_ptr = d_first_bytes + unique_count;
@@ -92,14 +99,14 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
     // if not found, continue to next byte
     if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
     // compute index of matched byte
-    auto offset_idx     = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
-    auto map_idx        = d_offsets[offset_idx];
+    auto const offset_idx = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
+    auto map_idx          = d_offsets[offset_idx];
     auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets;
     // check for targets that begin with chr
     while (map_idx < last_idx) {
-      auto const target_idx      = d_indices[map_idx++];
-      auto const temp_result_idx = (threadIdx.x * num_targets) + target_idx;
-      if (!shared_bools[temp_result_idx]) {  // not found before
+      auto const target_idx = d_indices[map_idx++];
+      auto const bool_idx   = (target_idx * cudf::detail::warp_size) + lane_idx;
+      if (!bools[bool_idx]) {  // not found before
         auto const d_target = d_targets.element<string_view>(target_idx);
         if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) {
           // first char already checked, only need to check the [2nd, end) chars if has.
@@ -107,7 +114,7 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
           for (auto i = 1; i < d_target.size_bytes() && found; i++) {
             if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; }
           }
-          if (found) { shared_bools[temp_result_idx] = true; }
+          if (found) { bools[bool_idx] = true; }
         }
       }
     }
@@ -116,16 +123,13 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
   // wait all lanes are done in a warp
   __syncwarp();
 
-  if (lane_idx == 0) {
-    for (int target_idx = 0; target_idx < num_targets; target_idx++) {
-      bool found = false;
-      // use thrust::any() algorithm with strided iterator?
-      for (size_type lidx = 0; lidx < cudf::detail::warp_size && !found; lidx++) {
-        auto const temp_idx = ((threadIdx.x + lidx) * num_targets) + target_idx;
-        if (shared_bools[temp_idx]) { found = true; }
-      }
-      d_results[target_idx][str_idx] = found;
-    }
+  // reduce the bools for each target to store in the result
+  for (auto target_idx = lane_idx; target_idx < num_targets;
+       target_idx += cudf::detail::warp_size) {
+    auto begin = bools + (target_idx * cudf::detail::warp_size);
+    auto found =
+      thrust::any_of(thrust::seq, begin, begin + cudf::detail::warp_size, thrust::identity<bool>{});
+    d_results[target_idx][str_idx] = found;
   }
 }
 
@@ -158,8 +162,8 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings,
     // if not found, continue to next byte
     if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
     // compute index of matched byte
-    auto offset_idx     = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
-    auto map_idx        = d_offsets[offset_idx];
+    auto const offset_idx = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
+    auto map_idx          = d_offsets[offset_idx];
     auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets;
     // check for targets that begin with chr
     while (map_idx < last_idx) {

From e8cb6cf8eb0441564e6f1c191535a3709c7fb74b Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 26 Sep 2024 17:52:03 -0400
Subject: [PATCH 06/23] use global memory if shared memory limit is reached

---
 cpp/src/strings/search/contains_multiple.cu | 104 ++++++++------------
 1 file changed, 42 insertions(+), 62 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index f154fb0a89c..a88c9848c24 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -63,6 +63,7 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
                                               size_type const* d_indices,
                                               size_type const* d_offsets,
                                               size_type unique_count,
+                                              bool* working_memory,
                                               cudf::device_span<bool*> d_results)
 {
   auto const num_targets = d_targets.size();
@@ -73,13 +74,16 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
   // get the string for this warp
   auto const d_str = d_strings.element<string_view>(str_idx);
 
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
   // size of shared_bools = num_targets * block_size
   // each thread uses num_targets bools
   extern __shared__ bool shared_bools[];
-  auto const lane_idx = idx % cudf::detail::warp_size;
   auto const warp_idx = threadIdx.x / cudf::detail::warp_size;
   // bools for the current string
-  auto bools = shared_bools + (warp_idx * cudf::detail::warp_size * num_targets);
+  auto bools = working_memory == nullptr
+                 ? (shared_bools + (warp_idx * cudf::detail::warp_size * num_targets))
+                 : (working_memory + (str_idx * cudf::detail::warp_size * num_targets));
 
   // initialize result: set true if target is empty, false otherwise
   for (auto target_idx = lane_idx; target_idx < num_targets;
@@ -183,18 +187,22 @@ CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings,
   }
 }
 
-std::vector<std::unique_ptr<column>> multi_contains(bool warp_parallel,
-                                                    strings_column_view const& input,
-                                                    strings_column_view const& targets,
-                                                    rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+}  // namespace
+
+std::unique_ptr<table> contains_multiple(strings_column_view const& input,
+                                         strings_column_view const& targets,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
 {
-  auto const num_targets = static_cast<size_type>(targets.size());
+  CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string.");
+  CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null");
+
+  auto const num_targets = targets.size();
 
   auto const d_strings = column_device_view::create(input.parent(), stream);
   auto const d_targets = column_device_view::create(targets.parent(), stream);
 
-  // copy the first byte of each target and sort the first bytes
+  // copy the first byte of each target and sort them
   auto first_bytes = rmm::device_uvector<u_char>(targets.size(), stream);
   auto indices     = rmm::device_uvector<size_type>(targets.size(), stream);
   {
@@ -251,66 +259,38 @@ std::vector<std::unique_ptr<column>> multi_contains(bool warp_parallel,
   auto d_indices     = indices.data();
   auto d_offsets     = offsets.data();
 
-  if (warp_parallel) {
+  // Smaller strings perform better with a row per string
+  bool const row_parallel = ((input.null_count() == input.size()) ||
+                             ((input.chars_size(stream) / (input.size() - input.null_count())) <=
+                              AVG_CHAR_BYTES_THRESHOLD));
+  if (row_parallel) {
+    cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()), block_size};
+    multi_contains_row_parallel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list);
+  } else {
     cudf::detail::grid_1d grid{
       static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
-    int shared_mem_size = block_size * targets.size();
+    auto shared_mem_size    = block_size * targets.size();
+    size_type work_mem_size = 0;
+    if (shared_mem_size > 12000) {  // TODO: Need to find a good value for this
+      shared_mem_size = 0;
+      work_mem_size   = targets.size() * input.size() * cudf::detail::warp_size;
+    }
+    auto working_memory = rmm::device_uvector<bool>(work_mem_size, stream);
     multi_contains_warp_parallel<<<grid.num_blocks,
                                    grid.num_threads_per_block,
                                    shared_mem_size,
-                                   stream.value()>>>(
-      *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list);
-  } else {
-    cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()), block_size};
-    multi_contains_row_parallel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-      *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list);
-  }
-
-  return results_list;
-}
-
-}  // namespace
-
-std::unique_ptr<table> contains_multiple(strings_column_view const& input,
-                                         strings_column_view const& targets,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string.");
-  CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null");
-
-  if ((input.null_count() == input.size()) ||
-      ((input.chars_size(stream) / (input.size() - input.null_count())) <=
-       AVG_CHAR_BYTES_THRESHOLD)) {
-    // Small strings. Searching for multiple targets in one thread seems to work fastest.
-    return std::make_unique<table>(
-      multi_contains(/**warp parallel**/ false, input, targets, stream, mr));
+                                   stream.value()>>>(*d_strings,
+                                                     *d_targets,
+                                                     d_first_bytes,
+                                                     d_indices,
+                                                     d_offsets,
+                                                     ucount,
+                                                     working_memory.data(),
+                                                     device_results_list);
   }
 
-  // Long strings
-  // Use warp parallel when the average string width is greater than the threshold
-  static constexpr size_type target_group_size = 16;  // perhaps can be calculated
-  if (targets.size() <= target_group_size) {
-    return std::make_unique<table>(
-      multi_contains(/**warp parallel**/ true, input, targets, stream, mr));
-  }
-
-  // Too many targets will consume more shared memory, so split targets
-  // TODO: test with large working memory (instead of shared-memory)
-  std::vector<std::unique_ptr<column>> ret_columns;
-  auto const num_groups = cudf::util::div_rounding_up_safe(targets.size(), target_group_size);
-  for (size_type group_idx = 0; group_idx < num_groups; group_idx++) {
-    auto const start_target = group_idx * target_group_size;
-    auto const end_target   = std::min(start_target + target_group_size, targets.size());
-
-    auto target_group = cudf::detail::slice(targets.parent(), start_target, end_target, stream);
-    auto bool_columns = multi_contains(
-      /**warp parallel**/ true, input, strings_column_view(target_group), stream, mr);
-    for (auto& c : bool_columns) {
-      ret_columns.push_back(std::move(c));  // transfer ownership
-    }
-  }
-  return std::make_unique<table>(std::move(ret_columns));
+  return std::make_unique<table>(std::move(results_list));
 }
 
 }  // namespace detail

From 5cc9d54e0b71336740e72f5fa3d488cd42e47f91 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 26 Sep 2024 18:05:05 -0400
Subject: [PATCH 07/23] change shared-memory threshold

---
 cpp/src/strings/search/contains_multiple.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index a88c9848c24..52a046e7cf0 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -272,7 +272,7 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
       static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
     auto shared_mem_size    = block_size * targets.size();
     size_type work_mem_size = 0;
-    if (shared_mem_size > 12000) {  // TODO: Need to find a good value for this
+    if (shared_mem_size > (16 * block_size)) {  // TODO: Need to find a good value for this
       shared_mem_size = 0;
       work_mem_size   = targets.size() * input.size() * cudf::detail::warp_size;
     }

From 4acb531c655d1c35285493f4edb465dcc2a41a32 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 27 Sep 2024 08:19:50 -0400
Subject: [PATCH 08/23] factor out benchmarks for find/contains_multiple

---
 cpp/benchmarks/CMakeLists.txt               |  1 +
 cpp/benchmarks/string/find.cpp              | 38 +--------
 cpp/benchmarks/string/find_multiple.cpp     | 87 +++++++++++++++++++++
 cpp/include/cudf/strings/find.hpp           | 33 --------
 cpp/include/cudf/strings/find_multiple.hpp  | 33 ++++++++
 cpp/src/strings/search/contains_multiple.cu |  2 +-
 cpp/tests/strings/find_tests.cpp            |  1 +
 7 files changed, 126 insertions(+), 69 deletions(-)
 create mode 100644 cpp/benchmarks/string/find_multiple.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 4113e38dcf4..c8da31544fc 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -373,6 +373,7 @@ ConfigureNVBench(
   string/count.cpp
   string/extract.cpp
   string/find.cpp
+  string/find_multiple.cpp
   string/gather.cpp
   string/join_strings.cpp
   string/lengths.cpp
diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp
index baa34b13e2e..2d5b6963a75 100644
--- a/cpp/benchmarks/string/find.cpp
+++ b/cpp/benchmarks/string/find.cpp
@@ -19,11 +19,8 @@
 
 #include <cudf_test/column_wrapper.hpp>
 
-#include <cudf/filling.hpp>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/strings/combine.hpp>
 #include <cudf/strings/find.hpp>
-#include <cudf/strings/find_multiple.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
@@ -49,15 +46,13 @@ static void bench_find_string(nvbench::state& state)
   auto const col    = build_input_column(n_rows, row_width, hit_rate);
   auto const input  = cudf::strings_column_view(col->view());
 
-  std::vector<std::string> h_targets({"5W", "5W43", "0987 5W43"});
-  cudf::string_scalar target(h_targets[2]);
-  cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
+  cudf::string_scalar target("0987 5W43");
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
   auto const chars_size = input.chars_size(stream);
   state.add_element_count(chars_size, "chars_size");
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);
-  if (api.substr(0, 4) == "find") {
+  if (api == "find") {
     state.add_global_memory_writes<nvbench::int32_t>(input.size());
   } else {
     state.add_global_memory_writes<nvbench::int8_t>(input.size());
@@ -66,35 +61,9 @@ static void bench_find_string(nvbench::state& state)
   if (api == "find") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { cudf::strings::find(input, target); });
-  } else if (api == "find_multi") {
-    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-      cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
-    });
   } else if (api == "contains") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { cudf::strings::contains(input, target); });
-  } else if (api == "contains_multi") {
-    constexpr int iters = 10;
-    std::vector<std::string> match_targets({" abc",
-                                            "W43",
-                                            "0987 5W43",
-                                            "123 abc",
-                                            "23 abc",
-                                            "3 abc",
-                                            "é",
-                                            "7 5W43",
-                                            "87 5W43",
-                                            "987 5W43"});
-    auto multi_targets = std::vector<std::string>{};
-    for (int i = 0; i < iters; i++) {
-      multi_targets.emplace_back(match_targets[i % match_targets.size()]);
-    }
-    cudf::test::strings_column_wrapper multi_targets_column(multi_targets.begin(),
-                                                            multi_targets.end());
-
-    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-      cudf::strings::contains_multiple(input, cudf::strings_column_view(multi_targets_column));
-    });
   } else if (api == "starts_with") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); });
@@ -106,8 +75,7 @@ static void bench_find_string(nvbench::state& state)
 
 NVBENCH_BENCH(bench_find_string)
   .set_name("find_string")
-  .add_string_axis("api",
-                   {"find", "find_multi", "contains", "contains_multi", "starts_with", "ends_with"})
+  .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"})
   .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
   .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
   .add_int64_axis("hit_rate", {20, 80});  // percentage
diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp
new file mode 100644
index 00000000000..8d47a4b03db
--- /dev/null
+++ b/cpp/benchmarks/string/find_multiple.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/strings/find.hpp>
+#include <cudf/strings/find_multiple.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
+                                                 cudf::size_type row_width,
+                                                 int32_t hit_rate);
+
+static void bench_find_string(nvbench::state& state)
+{
+  auto const n_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width    = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const hit_rate     = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+  auto const target_count = static_cast<cudf::size_type>(state.get_int64("targets"));
+  auto const api          = state.get_string("api");
+
+  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  auto const stream = cudf::get_default_stream();
+  auto const col    = build_input_column(n_rows, row_width, hit_rate);
+  auto const input  = cudf::strings_column_view(col->view());
+
+  // Note that these all match the first row of the raw_data in build_input_column.
+  // This is so the hit_rate can properly accounted for.
+  std::vector<std::string> target_data(
+    {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"});
+  auto h_targets = std::vector<std::string>{};
+  for (cudf::size_type i = 0; i < target_count; i++) {
+    h_targets.emplace_back(target_data[i % target_data.size()]);
+  }
+  cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto const chars_size = input.chars_size(stream);
+  state.add_element_count(chars_size, "chars_size");
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  if (api == "find") {
+    state.add_global_memory_writes<nvbench::int32_t>(input.size());
+  } else {
+    state.add_global_memory_writes<nvbench::int8_t>(input.size());
+  }
+
+  if (api == "find") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
+    });
+  } else if (api == "contains") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::contains_multiple(input, cudf::strings_column_view(targets));
+    });
+  }
+}
+
+NVBENCH_BENCH(bench_find_string)
+  .set_name("find_multiple")
+  .add_string_axis("api", {"find", "contains"})
+  .add_int64_axis("targets", {10, 20})
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
+  .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
+  .add_int64_axis("hit_rate", {20, 80});  // percentage
diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp
index 497c263cca8..e024b116a71 100644
--- a/cpp/include/cudf/strings/find.hpp
+++ b/cpp/include/cudf/strings/find.hpp
@@ -163,39 +163,6 @@ std::unique_ptr<column> contains(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
-/**
- * @brief Returns a table of columns of boolean values for each string where true indicates
- * the target string was found within that string in the provided column
- *
- * Each column in the result table corresponds to the result for the target string at the same
- * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th,
- * etc.
- *
- * If the target is not found for a string, false is returned for that entry in the output column.
- * If the target is an empty string, true is returned for all non-null entries in the output column.
- *
- * Any null string entries return corresponding null entries in the output columns.
- *
- * @code{.pseudo}
- * input = ["a", "b", "c"]
- * targets = ["a", "c"]
- * output is a table with two boolean columns:
- *   column 0: [true, false, false]
- *   column 1: [false, false, true]
- * @endcode
- *
- * @param input Strings instance for this operation
- * @param targets UTF-8 encoded strings to search for in each string in `input`
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New BOOL8 column
- */
-std::unique_ptr<table> contains_multiple(
-  strings_column_view const& input,
-  strings_column_view const& targets,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
 /**
  * @brief Returns a column of boolean values for each string where true indicates
  * the target string was found at the beginning of that string in the provided column.
diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 1fe446db8da..90ffd6e7df9 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -27,6 +27,39 @@ namespace strings {
  * @file
  */
 
+/**
+ * @brief Returns a table of columns of boolean values for each string where true indicates
+ * the target string was found within that string in the provided column
+ *
+ * Each column in the result table corresponds to the result for the target string at the same
+ * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th,
+ * etc.
+ *
+ * If the target is not found for a string, false is returned for that entry in the output column.
+ * If the target is an empty string, true is returned for all non-null entries in the output column.
+ *
+ * Any null string entries return corresponding null entries in the output columns.
+ *
+ * @code{.pseudo}
+ * input = ["a", "b", "c"]
+ * targets = ["a", "c"]
+ * output is a table with two boolean columns:
+ *   column 0: [true, false, false]
+ *   column 1: [false, false, true]
+ * @endcode
+ *
+ * @param input Strings instance for this operation
+ * @param targets UTF-8 encoded strings to search for in each string in `input`
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New BOOL8 column
+ */
+std::unique_ptr<table> contains_multiple(
+  strings_column_view const& input,
+  strings_column_view const& targets,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Returns a lists column with character position values where each
  * of the target strings are found in each string.
diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index 52a046e7cf0..1921fc8e0ea 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/detail/utilities.hpp>
-#include <cudf/strings/find.hpp>
+#include <cudf/strings/find_multiple.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp
index 698fea50bad..19714a5afd0 100644
--- a/cpp/tests/strings/find_tests.cpp
+++ b/cpp/tests/strings/find_tests.cpp
@@ -24,6 +24,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/attributes.hpp>
 #include <cudf/strings/find.hpp>
+#include <cudf/strings/find_multiple.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <vector>

From f3a3f249e9955473d83e667bb0ba902d78ac6608 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 27 Sep 2024 11:11:51 -0400
Subject: [PATCH 09/23] refactor kernels into templated one

---
 cpp/src/strings/search/contains_multiple.cu | 246 ++++++++++----------
 1 file changed, 119 insertions(+), 127 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index 1921fc8e0ea..e72cde6ec04 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -15,7 +15,6 @@
  */
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
@@ -35,6 +34,7 @@
 #include <cub/cub.cuh>
 #include <cuda/functional>
 #include <thrust/binary_search.h>
+#include <thrust/equal.h>
 #include <thrust/fill.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -42,7 +42,7 @@
 #include <thrust/sequence.h>
 #include <thrust/unique.h>
 
-#include <algorithm>
+#include <vector>
 
 namespace cudf {
 namespace strings {
@@ -57,48 +57,87 @@ namespace {
  */
 constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64;
 
-CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings,
-                                              column_device_view const d_targets,
-                                              u_char const* d_first_bytes,
-                                              size_type const* d_indices,
-                                              size_type const* d_offsets,
-                                              size_type unique_count,
-                                              bool* working_memory,
-                                              cudf::device_span<bool*> d_results)
+/**
+ * @brief Kernel for finding multiple targets in each row of input strings
+ *
+ * The d_first_bytes is sorted and unique so the d_indices and d_offsets
+ * are used to map the corresponding character to its d_targets entry.
+ *
+ * Example
+ * d_targets = ["foo", "hello", "world", "hi"]
+ *  - sorted first-chars: ['f','h','h','w']
+ * d_indices = [0, 3, 1, 2]
+ * d_first_bytes = ['f', 'h', 'w']   (unique)
+ * d_offsets = [0, 1, 3]
+ * unique_count = 3
+ *
+ * If 'h' is found, lower_bound produces pos=1 in d_first_bytes.
+ * This corresponds to d_offset[1]==1 which has two values:
+ * - (d_offsets[2] - d_offsets[1]) = (3 - 1) = 2.
+ * Set map_idx = d_offsets[1] = 1 and the two targets to check are sequential
+ * in the d_indices array:
+ * - tgt1_idx = d_indices[map_idx]   = 3 --> d_targets[3] == 'hi'
+ * - tgt2_idx = d_indices[map_idx+1] = 1 --> d_targets[1] == 'hello'
+ * The logic now only needs to check for either of these 2 targets.
+ *
+ * This kernel works in either row-per-string or warp-per-string depending
+ * on the template parameter. If tile_size==1, then it executes as a
+ * row-per-string. If tile_size=32, the it executes as a warp-per-string.
+ * No other options are supported for now.
+ *
+ * @tparam tile_size Number of threads per string
+ * @param d_strings Input strings
+ * @param d_targets Target strings to search within input strings
+ * @param d_first_bytes Sorted, unique list of first bytes of the target strings
+ * @param d_indices Indices to map sorted d_first_bytes to d_targets
+ * @param d_offsets Offsets to map d_indices to d_targets
+ * @param unique_count Number of unique values in d_first_bytes (and d_offsets)
+ * @param working_memory Global memory to use if shared-memory is too small
+ * @param d_results Bool results for each target within each string row
+ */
+template <cudf::thread_index_type tile_size>
+CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
+                                       column_device_view const d_targets,
+                                       u_char const* d_first_bytes,
+                                       size_type const* d_indices,
+                                       size_type const* d_offsets,
+                                       size_type unique_count,
+                                       bool* working_memory,
+                                       cudf::device_span<bool*> d_results)
 {
-  auto const num_targets = d_targets.size();
-  auto const idx         = cudf::detail::grid_1d::global_thread_id();
-  auto const str_idx     = idx / cudf::detail::warp_size;
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = idx / tile_size;
   if (str_idx >= d_strings.size()) { return; }
   if (d_strings.is_null(str_idx)) { return; }
-  // get the string for this warp
+
+  // get the string for this tile
   auto const d_str = d_strings.element<string_view>(str_idx);
 
-  auto const lane_idx = idx % cudf::detail::warp_size;
+  auto const lane_idx    = idx % tile_size;
+  auto const tile_idx    = static_cast<cudf::thread_index_type>(threadIdx.x) / tile_size;
+  auto const num_targets = d_targets.size();
 
   // size of shared_bools = num_targets * block_size
   // each thread uses num_targets bools
   extern __shared__ bool shared_bools[];
-  auto const warp_idx = threadIdx.x / cudf::detail::warp_size;
   // bools for the current string
-  auto bools = working_memory == nullptr
-                 ? (shared_bools + (warp_idx * cudf::detail::warp_size * num_targets))
-                 : (working_memory + (str_idx * cudf::detail::warp_size * num_targets));
+  auto bools = working_memory == nullptr ? (shared_bools + (tile_idx * tile_size * num_targets))
+                                         : (working_memory + (str_idx * tile_size * num_targets));
 
   // initialize result: set true if target is empty, false otherwise
-  for (auto target_idx = lane_idx; target_idx < num_targets;
-       target_idx += cudf::detail::warp_size) {
+  for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
     auto const d_target = d_targets.element<string_view>(target_idx);
-    auto const begin    = bools + (target_idx * cudf::detail::warp_size);
-    thrust::uninitialized_fill(
-      thrust::seq, begin, begin + cudf::detail::warp_size, d_target.empty());
+    auto const begin    = bools + (target_idx * tile_size);
+    thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty());
   }
+  if constexpr (tile_size == cudf::detail::warp_size) { __syncwarp(); }
 
   auto const last_ptr = d_first_bytes + unique_count;
   for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes();
-       str_byte_idx += cudf::detail::warp_size) {
+       str_byte_idx += tile_size) {
     // search for byte in first_bytes array
-    auto const chr      = static_cast<u_char>(*(d_str.data() + str_byte_idx));
+    auto const sptr     = d_str.data() + str_byte_idx;
+    auto const chr      = static_cast<u_char>(*sptr);
     auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr);
     // if not found, continue to next byte
     if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
@@ -109,84 +148,28 @@ CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings
     // check for targets that begin with chr
     while (map_idx < last_idx) {
       auto const target_idx = d_indices[map_idx++];
-      auto const bool_idx   = (target_idx * cudf::detail::warp_size) + lane_idx;
+      auto const bool_idx   = (target_idx * tile_size) + lane_idx;
       if (!bools[bool_idx]) {  // not found before
         auto const d_target = d_targets.element<string_view>(target_idx);
         if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) {
-          // first char already checked, only need to check the [2nd, end) chars if has.
-          bool found = true;
-          for (auto i = 1; i < d_target.size_bytes() && found; i++) {
-            if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; }
+          // first char already checked, so just check the [1, end) chars match
+          auto const tp = d_target.data();
+          if (thrust::equal(thrust::seq, tp + 1, tp + d_target.size_bytes(), sptr + 1)) {
+            bools[bool_idx] = true;
           }
-          if (found) { bools[bool_idx] = true; }
         }
       }
     }
   }
-
-  // wait all lanes are done in a warp
-  __syncwarp();
+  if constexpr (tile_size == cudf::detail::warp_size) { __syncwarp(); }
 
   // reduce the bools for each target to store in the result
-  for (auto target_idx = lane_idx; target_idx < num_targets;
-       target_idx += cudf::detail::warp_size) {
-    auto begin = bools + (target_idx * cudf::detail::warp_size);
-    auto found =
-      thrust::any_of(thrust::seq, begin, begin + cudf::detail::warp_size, thrust::identity<bool>{});
-    d_results[target_idx][str_idx] = found;
-  }
-}
-
-CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings,
-                                             column_device_view const d_targets,
-                                             u_char const* d_first_bytes,
-                                             size_type const* d_indices,
-                                             size_type const* d_offsets,
-                                             size_type unique_count,
-                                             cudf::device_span<bool*> d_results)
-{
-  auto const str_idx     = static_cast<size_type>(cudf::detail::grid_1d::global_thread_id());
-  auto const num_targets = d_targets.size();
-  if (str_idx >= d_strings.size()) { return; }
-  if (d_strings.is_null(str_idx)) { return; }
-  auto const d_str = d_strings.element<string_view>(str_idx);
-
-  // initialize output; the result of searching empty target is true
-  for (auto target_idx = 0; target_idx < num_targets; ++target_idx) {
-    auto const d_target            = d_targets.element<string_view>(target_idx);
-    d_results[target_idx][str_idx] = d_target.empty();
-  }
-
-  // process each byte of the current string
-  auto const last_ptr = d_first_bytes + unique_count;
-  for (auto str_byte_idx = 0; str_byte_idx < d_str.size_bytes(); ++str_byte_idx) {
-    // search for byte in first_bytes array
-    auto const chr      = static_cast<u_char>(*(d_str.data() + str_byte_idx));
-    auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr);
-    // if not found, continue to next byte
-    if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
-    // compute index of matched byte
-    auto const offset_idx = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
-    auto map_idx          = d_offsets[offset_idx];
-    auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets;
-    // check for targets that begin with chr
-    while (map_idx < last_idx) {
-      auto const target_idx = d_indices[map_idx++];
-      if (!d_results[target_idx][str_idx]) {  // not found before
-        auto const d_target = d_targets.element<string_view>(target_idx);
-        if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) {
-          // first char already checked, only need to check the [2nd, end) chars
-          bool found = true;
-          for (auto i = 1; i < d_target.size_bytes() && found; i++) {
-            if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; }
-          }
-          if (found) { d_results[target_idx][str_idx] = true; }
-        }
-      }
-    }
+  for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
+    auto const begin = bools + (target_idx * tile_size);
+    d_results[target_idx][str_idx] =
+      thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity<bool>{});
   }
 }
-
 }  // namespace
 
 std::unique_ptr<table> contains_multiple(strings_column_view const& input,
@@ -197,8 +180,6 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
   CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string.");
   CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null");
 
-  auto const num_targets = targets.size();
-
   auto const d_strings = column_device_view::create(input.parent(), stream);
   auto const d_targets = column_device_view::create(targets.parent(), stream);
 
@@ -214,23 +195,25 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
     auto count_itr = thrust::make_counting_iterator<size_type>(0);
     auto keys_out  = first_bytes.begin();
     auto vals_out  = indices.begin();
+    auto num_items = targets.size();
     auto cmp_op    = thrust::less();
     auto sv        = stream.value();
 
     std::size_t tmp_bytes = 0;
     cub::DeviceMergeSort::SortPairsCopy(
-      nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_targets, cmp_op, sv);
+      nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv);
     auto tmp_stg = rmm::device_buffer(tmp_bytes, stream);
     cub::DeviceMergeSort::SortPairsCopy(
-      tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_targets, cmp_op, sv);
+      tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv);
   }
 
-  // remove duplicates to speed up lower_bound
+  // remove duplicates to help speed up lower_bound
   auto offsets = rmm::device_uvector<size_type>(targets.size(), stream);
   thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end());
-  auto end = thrust::unique_by_key(
+  auto const end = thrust::unique_by_key(
     rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin());
-  auto ucount = static_cast<size_type>(thrust::distance(first_bytes.begin(), end.first));
+  auto const unique_count =
+    static_cast<size_type>(thrust::distance(first_bytes.begin(), end.first));
 
   // create output columns
   auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) {
@@ -241,56 +224,65 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
                                stream,
                                mr);
   });
-  auto results_list =
-    std::vector<std::unique_ptr<column>>(results_iter, results_iter + targets.size());
-  auto device_results_list = [&] {
+  auto results = std::vector<std::unique_ptr<column>>(results_iter, results_iter + targets.size());
+  auto d_results = [&] {
     auto host_results_pointer_iter =
-      thrust::make_transform_iterator(results_list.begin(), [](auto const& results_column) {
+      thrust::make_transform_iterator(results.begin(), [](auto const& results_column) {
         return results_column->mutable_view().template data<bool>();
       });
-    auto host_results_pointers = std::vector<bool*>(
-      host_results_pointer_iter, host_results_pointer_iter + results_list.size());
+    auto host_results_pointers =
+      std::vector<bool*>(host_results_pointer_iter, host_results_pointer_iter + results.size());
     return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr);
   }();
 
   constexpr cudf::thread_index_type block_size = 256;
+  constexpr size_type targets_threshold        = 16;  // for shared-memory size
 
   auto d_first_bytes = first_bytes.data();
   auto d_indices     = indices.data();
   auto d_offsets     = offsets.data();
 
-  // Smaller strings perform better with a row per string
+  auto const shared_mem_size =
+    targets.size() < targets_threshold ? (block_size * targets.size()) : 0;
+  auto const work_mem_size =
+    targets.size() < targets_threshold
+      ? 0
+      : (static_cast<std::size_t>(targets.size()) * input.size() * cudf::detail::warp_size);
+  auto working_memory = rmm::device_uvector<bool>(work_mem_size, stream);
+
   bool const row_parallel = ((input.null_count() == input.size()) ||
                              ((input.chars_size(stream) / (input.size() - input.null_count())) <=
                               AVG_CHAR_BYTES_THRESHOLD));
   if (row_parallel) {
+    // Smaller strings perform better with a row per string
     cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()), block_size};
-    multi_contains_row_parallel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-      *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list);
+    multi_contains_kernel<1>
+      <<<grid.num_blocks, grid.num_threads_per_block, shared_mem_size, stream.value()>>>(
+        *d_strings,
+        *d_targets,
+        d_first_bytes,
+        d_indices,
+        d_offsets,
+        unique_count,
+        working_memory.data(),
+        d_results);
   } else {
+    // Longer strings perform better with a warp per string
     cudf::detail::grid_1d grid{
       static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
-    auto shared_mem_size    = block_size * targets.size();
-    size_type work_mem_size = 0;
-    if (shared_mem_size > (16 * block_size)) {  // TODO: Need to find a good value for this
-      shared_mem_size = 0;
-      work_mem_size   = targets.size() * input.size() * cudf::detail::warp_size;
-    }
-    auto working_memory = rmm::device_uvector<bool>(work_mem_size, stream);
-    multi_contains_warp_parallel<<<grid.num_blocks,
-                                   grid.num_threads_per_block,
-                                   shared_mem_size,
-                                   stream.value()>>>(*d_strings,
-                                                     *d_targets,
-                                                     d_first_bytes,
-                                                     d_indices,
-                                                     d_offsets,
-                                                     ucount,
-                                                     working_memory.data(),
-                                                     device_results_list);
+    multi_contains_kernel<cudf::detail::warp_size>
+      <<<grid.num_blocks, grid.num_threads_per_block, shared_mem_size, stream.value()>>>(
+        *d_strings,
+        *d_targets,
+        d_first_bytes,
+        d_indices,
+        d_offsets,
+        unique_count,
+        working_memory.data(),
+        d_results);
   }
 
-  return std::make_unique<table>(std::move(results_list));
+  return std::make_unique<table>(std::move(results));
 }
 
 }  // namespace detail

From 33d784746246a7d0262bed37e53af829bcd0c3a7 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 30 Sep 2024 14:06:54 -0400
Subject: [PATCH 10/23] move tests to find_multiple_tests.cpp

---
 cpp/benchmarks/string/find_multiple.cpp     |   3 +-
 cpp/src/strings/search/contains_multiple.cu |   2 +-
 cpp/tests/strings/find_multiple_tests.cpp   | 149 ++++++++++++++++++++
 cpp/tests/strings/find_tests.cpp            | 149 --------------------
 4 files changed, 151 insertions(+), 152 deletions(-)

diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp
index 8d47a4b03db..f3115677fa4 100644
--- a/cpp/benchmarks/string/find_multiple.cpp
+++ b/cpp/benchmarks/string/find_multiple.cpp
@@ -59,7 +59,6 @@ static void bench_find_string(nvbench::state& state)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
   auto const chars_size = input.chars_size(stream);
-  state.add_element_count(chars_size, "chars_size");
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);
   if (api == "find") {
     state.add_global_memory_writes<nvbench::int32_t>(input.size());
@@ -81,7 +80,7 @@ static void bench_find_string(nvbench::state& state)
 NVBENCH_BENCH(bench_find_string)
   .set_name("find_multiple")
   .add_string_axis("api", {"find", "contains"})
-  .add_int64_axis("targets", {10, 20})
+  .add_int64_axis("targets", {10, 20, 40})
   .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
   .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
   .add_int64_axis("hit_rate", {20, 80});  // percentage
diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index e72cde6ec04..7c63fbf0453 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -236,7 +236,7 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
   }();
 
   constexpr cudf::thread_index_type block_size = 256;
-  constexpr size_type targets_threshold        = 16;  // for shared-memory size
+  constexpr size_type targets_threshold        = 32;  // for shared-memory size
 
   auto d_first_bytes = first_bytes.data();
   auto d_indices     = indices.data();
diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp
index 41a5940c880..af6c5800f4d 100644
--- a/cpp/tests/strings/find_multiple_tests.cpp
+++ b/cpp/tests/strings/find_multiple_tests.cpp
@@ -17,6 +17,7 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
@@ -80,3 +81,151 @@ TEST_F(StringsFindMultipleTest, ErrorTest)
   // targets cannot have nulls
   EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), cudf::logic_error);
 }
+
+TEST_F(StringsFindMultipleTest, MultiContains)
+{
+  constexpr int num_rows = 1024 + 1;
+  // replicate the following 9 rows:
+  std::vector<std::string> s = {
+    "Héllo, there world and goodbye",
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving",
+    "the following code snippet demonstrates how to use search for values in an ordered range",
+    "it returns the last position where value could be inserted without violating the ordering",
+    "algorithms execution is parallelized as determined by an execution policy. t",
+    "he this is a continuation of previous row to make sure string boundaries are honored",
+    "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~",
+    "",
+    ""};
+
+  // replicate strings
+  auto string_itr =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; });
+
+  // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ......
+  auto string_v = cudf::detail::make_counting_transform_iterator(
+    0, [&](auto i) { return (i + 1) % s.size() != 0; });
+
+  auto const strings =
+    cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v);
+  auto strings_view = cudf::strings_column_view(strings);
+  std::vector<std::string> match_targets({" the ", "a", "", "é"});
+  cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(),
+                                                          match_targets.end());
+  auto results =
+    cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column));
+
+  std::vector<bool> ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0};
+  std::vector<bool> ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0};
+  std::vector<bool> ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0};
+  std::vector<bool> ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  auto make_bool_col_fn = [&string_v, &num_rows](std::vector<bool> bools) {
+    auto iter = cudf::detail::make_counting_transform_iterator(
+      0, [&](auto i) { return bools[i % bools.size()]; });
+    return cudf::test::fixed_width_column_wrapper<bool>(iter, iter + num_rows, string_v);
+  };
+
+  auto expected_0 = make_bool_col_fn(ret_0);
+  auto expected_1 = make_bool_col_fn(ret_1);
+  auto expected_2 = make_bool_col_fn(ret_2);
+  auto expected_3 = make_bool_col_fn(ret_3);
+
+  auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected);
+}
+
+TEST_F(StringsFindMultipleTest, MultiContainsMoreTargets)
+{
+  auto const strings = cudf::test::strings_column_wrapper{
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving "
+    "quick brown fox jumped",
+    "the following code snippet demonstrates how to use search for values in an ordered rangethe "
+    "following code snippet",
+    "thé it returns the last position where value could be inserted without violating ordering thé "
+    "it returns the last position"};
+  auto strings_view = cudf::strings_column_view(strings);
+  std::vector<std::string> targets({"lazy brown", "non-exist", ""});
+
+  std::vector<cudf::test::fixed_width_column_wrapper<bool>> expects;
+  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({1, 0, 0}));
+  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0}));
+  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1}));
+
+  std::vector<std::string> match_targets;
+  int max_num_targets = 50;
+
+  for (int num_targets = 1; num_targets < max_num_targets; num_targets++) {
+    match_targets.clear();
+    for (int i = 0; i < num_targets; i++) {
+      match_targets.push_back(targets[i % targets.size()]);
+    }
+
+    cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(),
+                                                            match_targets.end());
+    auto results = cudf::strings::contains_multiple(
+      strings_view, cudf::strings_column_view(multi_targets_column));
+    EXPECT_EQ(results->num_columns(), num_targets);
+    for (int i = 0; i < num_targets; i++) {
+      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]);
+    }
+  }
+}
+
+TEST_F(StringsFindMultipleTest, MultiContainsLongStrings)
+{
+  constexpr int num_rows = 1024 + 1;
+  // replicate the following 7 rows:
+  std::vector<std::string> s = {
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving "
+    "quick brown fox jumped",
+    "the following code snippet demonstrates how to use search for values in an ordered rangethe "
+    "following code snippet",
+    "thé it returns the last position where value could be inserted without violating ordering thé "
+    "it returns the last position",
+    "algorithms execution is parallelized as determined by an execution policy. t algorithms "
+    "execution is parallelized as ",
+    "he this is a continuation of previous row to make sure string boundaries are honored he this "
+    "is a continuation of previous row",
+    "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ "
+    "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL",
+    ""};
+
+  // replicate strings
+  auto string_itr =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; });
+
+  // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ......
+  auto string_v = cudf::detail::make_counting_transform_iterator(
+    0, [&](auto i) { return (i + 1) % s.size() != 0; });
+
+  auto const strings =
+    cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v);
+
+  auto sv      = cudf::strings_column_view(strings);
+  auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "});
+  auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets));
+
+  std::vector<bool> ret_0 = {1, 0, 1, 0, 0, 0, 0};
+  std::vector<bool> ret_1 = {0, 1, 0, 0, 0, 0, 0};
+  std::vector<bool> ret_2 = {1, 1, 1, 1, 1, 1, 0};
+  std::vector<bool> ret_3 = {0, 0, 0, 0, 1, 0, 0};
+  std::vector<bool> ret_4 = {1, 0, 0, 0, 0, 0, 0};
+  std::vector<bool> ret_5 = {0, 0, 1, 0, 0, 0, 0};
+
+  auto make_bool_col_fn = [&string_v, &num_rows](std::vector<bool> bools) {
+    auto iter = cudf::detail::make_counting_transform_iterator(
+      0, [&](auto i) { return bools[i % bools.size()]; });
+    return cudf::test::fixed_width_column_wrapper<bool>(iter, iter + num_rows, string_v);
+  };
+
+  auto expected_0 = make_bool_col_fn(ret_0);
+  auto expected_1 = make_bool_col_fn(ret_1);
+  auto expected_2 = make_bool_col_fn(ret_2);
+  auto expected_3 = make_bool_col_fn(ret_3);
+  auto expected_4 = make_bool_col_fn(ret_4);
+  auto expected_5 = make_bool_col_fn(ret_5);
+
+  auto expected =
+    cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected);
+}
diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp
index 19714a5afd0..6b2163aeda1 100644
--- a/cpp/tests/strings/find_tests.cpp
+++ b/cpp/tests/strings/find_tests.cpp
@@ -18,7 +18,6 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/scalar/scalar.hpp>
@@ -198,154 +197,6 @@ TEST_F(StringsFindTest, ContainsLongStrings)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
-TEST_F(StringsFindTest, MultiContains)
-{
-  constexpr int num_rows = 1024 + 1;
-  // replicate the following 9 rows:
-  std::vector<std::string> s = {
-    "Héllo, there world and goodbye",
-    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving",
-    "the following code snippet demonstrates how to use search for values in an ordered range",
-    "it returns the last position where value could be inserted without violating the ordering",
-    "algorithms execution is parallelized as determined by an execution policy. t",
-    "he this is a continuation of previous row to make sure string boundaries are honored",
-    "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~",
-    "",
-    ""};
-
-  // replicate strings
-  auto string_itr =
-    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; });
-
-  // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ......
-  auto string_v = cudf::detail::make_counting_transform_iterator(
-    0, [&](auto i) { return (i + 1) % s.size() != 0; });
-
-  auto const strings =
-    cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v);
-  auto strings_view = cudf::strings_column_view(strings);
-  std::vector<std::string> match_targets({" the ", "a", "", "é"});
-  cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(),
-                                                          match_targets.end());
-  auto results =
-    cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column));
-
-  std::vector<bool> ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0};
-  std::vector<bool> ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0};
-  std::vector<bool> ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0};
-  std::vector<bool> ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0};
-
-  auto make_bool_col_fn = [&string_v, &num_rows](std::vector<bool> bools) {
-    auto iter = cudf::detail::make_counting_transform_iterator(
-      0, [&](auto i) { return bools[i % bools.size()]; });
-    return cudf::test::fixed_width_column_wrapper<bool>(iter, iter + num_rows, string_v);
-  };
-
-  auto expected_0 = make_bool_col_fn(ret_0);
-  auto expected_1 = make_bool_col_fn(ret_1);
-  auto expected_2 = make_bool_col_fn(ret_2);
-  auto expected_3 = make_bool_col_fn(ret_3);
-
-  auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3});
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected);
-}
-
-TEST_F(StringsFindTest, MultiContainsMoreTargets)
-{
-  auto const strings = cudf::test::strings_column_wrapper{
-    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving "
-    "quick brown fox jumped",
-    "the following code snippet demonstrates how to use search for values in an ordered rangethe "
-    "following code snippet",
-    "thé it returns the last position where value could be inserted without violating ordering thé "
-    "it returns the last position"};
-  auto strings_view = cudf::strings_column_view(strings);
-  std::vector<std::string> targets({"lazy brown", "non-exist", ""});
-
-  std::vector<cudf::test::fixed_width_column_wrapper<bool>> expects;
-  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({1, 0, 0}));
-  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0}));
-  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1}));
-
-  std::vector<std::string> match_targets;
-  int max_num_targets = 50;
-
-  for (int num_targets = 1; num_targets < max_num_targets; num_targets++) {
-    match_targets.clear();
-    for (int i = 0; i < num_targets; i++) {
-      match_targets.push_back(targets[i % targets.size()]);
-    }
-
-    cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(),
-                                                            match_targets.end());
-    auto results = cudf::strings::contains_multiple(
-      strings_view, cudf::strings_column_view(multi_targets_column));
-    EXPECT_EQ(results->num_columns(), num_targets);
-    for (int i = 0; i < num_targets; i++) {
-      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]);
-    }
-  }
-}
-
-TEST_F(StringsFindTest, MultiContainsLongStrings)
-{
-  constexpr int num_rows = 1024 + 1;
-  // replicate the following 7 rows:
-  std::vector<std::string> s = {
-    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving "
-    "quick brown fox jumped",
-    "the following code snippet demonstrates how to use search for values in an ordered rangethe "
-    "following code snippet",
-    "thé it returns the last position where value could be inserted without violating ordering thé "
-    "it returns the last position",
-    "algorithms execution is parallelized as determined by an execution policy. t algorithms "
-    "execution is parallelized as ",
-    "he this is a continuation of previous row to make sure string boundaries are honored he this "
-    "is a continuation of previous row",
-    "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ "
-    "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL",
-    ""};
-
-  // replicate strings
-  auto string_itr =
-    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; });
-
-  // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ......
-  auto string_v = cudf::detail::make_counting_transform_iterator(
-    0, [&](auto i) { return (i + 1) % s.size() != 0; });
-
-  auto const strings =
-    cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v);
-
-  auto sv      = cudf::strings_column_view(strings);
-  auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "});
-  auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets));
-
-  std::vector<bool> ret_0 = {1, 0, 1, 0, 0, 0, 0};
-  std::vector<bool> ret_1 = {0, 1, 0, 0, 0, 0, 0};
-  std::vector<bool> ret_2 = {1, 1, 1, 1, 1, 1, 0};
-  std::vector<bool> ret_3 = {0, 0, 0, 0, 1, 0, 0};
-  std::vector<bool> ret_4 = {1, 0, 0, 0, 0, 0, 0};
-  std::vector<bool> ret_5 = {0, 0, 1, 0, 0, 0, 0};
-
-  auto make_bool_col_fn = [&string_v, &num_rows](std::vector<bool> bools) {
-    auto iter = cudf::detail::make_counting_transform_iterator(
-      0, [&](auto i) { return bools[i % bools.size()]; });
-    return cudf::test::fixed_width_column_wrapper<bool>(iter, iter + num_rows, string_v);
-  };
-
-  auto expected_0 = make_bool_col_fn(ret_0);
-  auto expected_1 = make_bool_col_fn(ret_1);
-  auto expected_2 = make_bool_col_fn(ret_2);
-  auto expected_3 = make_bool_col_fn(ret_3);
-  auto expected_4 = make_bool_col_fn(ret_4);
-  auto expected_5 = make_bool_col_fn(ret_5);
-
-  auto expected =
-    cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5});
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected);
-}
-
 TEST_F(StringsFindTest, StartsWith)
 {
   cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""},

From 04717e3002f1faca071f82b683a77ceca2c6ff1e Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 1 Oct 2024 13:30:29 -0400
Subject: [PATCH 11/23] use output directly in row-parallel kernel

---
 cpp/src/strings/search/contains_multiple.cu | 60 ++++++++++++---------
 1 file changed, 35 insertions(+), 25 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index 7c63fbf0453..8acef1170ce 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -127,8 +127,12 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
   // initialize result: set true if target is empty, false otherwise
   for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
     auto const d_target = d_targets.element<string_view>(target_idx);
-    auto const begin    = bools + (target_idx * tile_size);
-    thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty());
+    if constexpr (tile_size == 1) {
+      d_results[target_idx][str_idx] = d_target.empty();
+    } else {
+      auto const begin = bools + (target_idx * tile_size);
+      thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty());
+    }
   }
   if constexpr (tile_size == cudf::detail::warp_size) { __syncwarp(); }
 
@@ -149,25 +153,31 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
     while (map_idx < last_idx) {
       auto const target_idx = d_indices[map_idx++];
       auto const bool_idx   = (target_idx * tile_size) + lane_idx;
-      if (!bools[bool_idx]) {  // not found before
+      auto const found      = tile_size == 1 ? d_results[target_idx][str_idx] : bools[bool_idx];
+      if (!found) {  // not found before
         auto const d_target = d_targets.element<string_view>(target_idx);
         if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) {
           // first char already checked, so just check the [1, end) chars match
           auto const tp = d_target.data();
           if (thrust::equal(thrust::seq, tp + 1, tp + d_target.size_bytes(), sptr + 1)) {
-            bools[bool_idx] = true;
+            if constexpr (tile_size == 1) {
+              d_results[target_idx][str_idx] = true;
+            } else {
+              bools[bool_idx] = true;
+            }
           }
         }
       }
     }
   }
-  if constexpr (tile_size == cudf::detail::warp_size) { __syncwarp(); }
-
-  // reduce the bools for each target to store in the result
-  for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
-    auto const begin = bools + (target_idx * tile_size);
-    d_results[target_idx][str_idx] =
-      thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity<bool>{});
+  if constexpr (tile_size == cudf::detail::warp_size) {
+    __syncwarp();
+    // reduce the bools for each target to store in the result
+    for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
+      auto const begin = bools + (target_idx * tile_size);
+      d_results[target_idx][str_idx] =
+        thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity<bool>{});
+    }
   }
 }
 }  // namespace
@@ -242,30 +252,30 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
   auto d_indices     = indices.data();
   auto d_offsets     = offsets.data();
 
+  bool const row_parallel = ((input.null_count() == input.size()) ||
+                             ((input.chars_size(stream) / (input.size() - input.null_count())) <=
+                              AVG_CHAR_BYTES_THRESHOLD));
+
   auto const shared_mem_size =
-    targets.size() < targets_threshold ? (block_size * targets.size()) : 0;
+    !row_parallel && (targets.size() < targets_threshold) ? (block_size * targets.size()) : 0;
   auto const work_mem_size =
-    targets.size() < targets_threshold
+    row_parallel || (targets.size() < targets_threshold)
       ? 0
       : (static_cast<std::size_t>(targets.size()) * input.size() * cudf::detail::warp_size);
   auto working_memory = rmm::device_uvector<bool>(work_mem_size, stream);
 
-  bool const row_parallel = ((input.null_count() == input.size()) ||
-                             ((input.chars_size(stream) / (input.size() - input.null_count())) <=
-                              AVG_CHAR_BYTES_THRESHOLD));
   if (row_parallel) {
     // Smaller strings perform better with a row per string
     cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()), block_size};
     multi_contains_kernel<1>
-      <<<grid.num_blocks, grid.num_threads_per_block, shared_mem_size, stream.value()>>>(
-        *d_strings,
-        *d_targets,
-        d_first_bytes,
-        d_indices,
-        d_offsets,
-        unique_count,
-        working_memory.data(),
-        d_results);
+      <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(*d_strings,
+                                                                           *d_targets,
+                                                                           d_first_bytes,
+                                                                           d_indices,
+                                                                           d_offsets,
+                                                                           unique_count,
+                                                                           working_memory.data(),
+                                                                           d_results);
   } else {
     // Longer strings perform better with a warp per string
     cudf::detail::grid_1d grid{

From 62a51c69833b70b5f87a76e3653f0574aca506f9 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 15 Oct 2024 08:54:17 -0400
Subject: [PATCH 12/23] Update doxygen, exceptions

Co-authored-by: Chong Gao <res_life@163.com>
---
 cpp/include/cudf/strings/find_multiple.hpp  | 13 +++++++------
 cpp/src/strings/search/contains_multiple.cu |  5 +++--
 cpp/src/strings/search/find_multiple.cu     |  5 +++--
 cpp/tests/strings/find_multiple_tests.cpp   |  6 ++++--
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 90ffd6e7df9..2173fc97526 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -28,8 +28,7 @@ namespace strings {
  */
 
 /**
- * @brief Returns a table of columns of boolean values for each string where true indicates
- * the target string was found within that string in the provided column
+ * @brief Searches for the given target strings within each string in the provided column
  *
  * Each column in the result table corresponds to the result for the target string at the same
  * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th,
@@ -38,7 +37,7 @@ namespace strings {
  * If the target is not found for a string, false is returned for that entry in the output column.
  * If the target is an empty string, true is returned for all non-null entries in the output column.
  *
- * Any null string entries return corresponding null entries in the output columns.
+ * Any null input strings return corresponding null entries in the output columns.
  *
  * @code{.pseudo}
  * input = ["a", "b", "c"]
@@ -48,6 +47,8 @@ namespace strings {
  *   column 1: [false, false, true]
  * @endcode
  *
+ * @throw std::invalid_argument if `targets` is empty or contains nulls
+ *
  * @param input Strings instance for this operation
  * @param targets UTF-8 encoded strings to search for in each string in `input`
  * @param stream CUDA stream used for device memory operations and kernel launches
@@ -61,8 +62,8 @@ std::unique_ptr<table> contains_multiple(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Returns a lists column with character position values where each
- * of the target strings are found in each string.
+ * @brief Searches for the given target strings within each string in the provided column
+ * and returns the position the targets were found
  *
  * The size of the output column is `input.size()`.
  * Each row of the output column is of size `targets.size()`.
@@ -78,7 +79,7 @@ std::unique_ptr<table> contains_multiple(
  *           [-1,-1, 1 ]}  // for "def": "a" and "b" not found, "e" at  pos 1
  * @endcode
  *
- * @throw cudf::logic_error if `targets` is empty or contains nulls
+ * @throw , std::invalid_argument if `targets` is empty or contains nulls
  *
  * @param input Strings instance for this operation
  * @param targets Strings to search for in each string
diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index 8acef1170ce..ee5519a87fa 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -187,8 +187,9 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string.");
-  CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null");
+  CUDF_EXPECTS(
+    not targets.is_empty(), "Must specify at least one target string.", std::invalid_argument);
+  CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null", std::invalid_argument);
 
   auto const d_strings = column_device_view::create(input.parent(), stream);
   auto const d_targets = column_device_view::create(targets.parent(), stream);
diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu
index ec7015878dd..67226b259d4 100644
--- a/cpp/src/strings/search/find_multiple.cu
+++ b/cpp/src/strings/search/find_multiple.cu
@@ -42,8 +42,9 @@ std::unique_ptr<column> find_multiple(strings_column_view const& input,
 {
   auto const strings_count = input.size();
   auto const targets_count = targets.size();
-  CUDF_EXPECTS(targets_count > 0, "Must include at least one search target");
-  CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings");
+  CUDF_EXPECTS(targets_count > 0, "Must include at least one search target", std::invalid_argument);
+  CUDF_EXPECTS(
+    !targets.has_nulls(), "Search targets cannot contain null strings", std::invalid_argument);
 
   auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_strings      = *strings_column;
diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp
index af6c5800f4d..3c8483b153d 100644
--- a/cpp/tests/strings/find_multiple_tests.cpp
+++ b/cpp/tests/strings/find_multiple_tests.cpp
@@ -76,10 +76,12 @@ TEST_F(StringsFindMultipleTest, ErrorTest)
   auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();
   auto empty_view                     = cudf::strings_column_view(zero_size_strings_column);
   // targets must have at least one string
-  EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), std::invalid_argument);
+  EXPECT_THROW(cudf::strings::contains_multiple(strings_view, empty_view), std::invalid_argument);
 
   // targets cannot have nulls
-  EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), std::invalid_argument);
+  EXPECT_THROW(cudf::strings::contains_multiple(strings_view, strings_view), std::invalid_argument);
 }
 
 TEST_F(StringsFindMultipleTest, MultiContains)

From ede8a7bdd56d84b42cc70c9984a0b85a841c0c72 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 17 Oct 2024 19:26:22 -0400
Subject: [PATCH 13/23] cleanup code part I

---
 cpp/include/cudf/strings/find_multiple.hpp  |  2 +-
 cpp/src/strings/search/contains_multiple.cu | 34 ++++++++++++---------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 2173fc97526..6b9cc3723b3 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -53,7 +53,7 @@ namespace strings {
  * @param targets UTF-8 encoded strings to search for in each string in `input`
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New BOOL8 column
+ * @return Table of BOOL8 columns
  */
 std::unique_ptr<table> contains_multiple(
   strings_column_view const& input,
diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index ee5519a87fa..cc91719915b 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -170,13 +170,15 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
       }
     }
   }
-  if constexpr (tile_size == cudf::detail::warp_size) {
+
+  if constexpr (tile_size > 1) {
     __syncwarp();
     // reduce the bools for each target to store in the result
     for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
       auto const begin = bools + (target_idx * tile_size);
       d_results[target_idx][str_idx] =
         thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity<bool>{});
+      // cooperative group implementation was almost 3x slower than this parallel reduce
     }
   }
 }
@@ -247,7 +249,8 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
   }();
 
   constexpr cudf::thread_index_type block_size = 256;
-  constexpr size_type targets_threshold        = 32;  // for shared-memory size
+  // calculated (benchmarked) for efficient use of shared-memory
+  constexpr size_type targets_threshold = 32;
 
   auto d_first_bytes = first_bytes.data();
   auto d_indices     = indices.data();
@@ -257,14 +260,6 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
                              ((input.chars_size(stream) / (input.size() - input.null_count())) <=
                               AVG_CHAR_BYTES_THRESHOLD));
 
-  auto const shared_mem_size =
-    !row_parallel && (targets.size() < targets_threshold) ? (block_size * targets.size()) : 0;
-  auto const work_mem_size =
-    row_parallel || (targets.size() < targets_threshold)
-      ? 0
-      : (static_cast<std::size_t>(targets.size()) * input.size() * cudf::detail::warp_size);
-  auto working_memory = rmm::device_uvector<bool>(work_mem_size, stream);
-
   if (row_parallel) {
     // Smaller strings perform better with a row per string
     cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()), block_size};
@@ -275,13 +270,22 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
                                                                            d_indices,
                                                                            d_offsets,
                                                                            unique_count,
-                                                                           working_memory.data(),
+                                                                           nullptr,
                                                                            d_results);
   } else {
-    // Longer strings perform better with a warp per string
-    cudf::detail::grid_1d grid{
-      static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
-    multi_contains_kernel<cudf::detail::warp_size>
+    constexpr cudf::thread_index_type tile_size = cudf::detail::warp_size;
+
+    auto const shared_mem_size =
+      (targets.size() < targets_threshold) ? (block_size * targets.size()) : 0;
+    auto const work_mem_size =
+      (targets.size() < targets_threshold) ? 0 : tile_size * targets.size() * input.size();
+    auto working_memory = rmm::device_uvector<bool>(work_mem_size, stream);
+
+    // std::cout << shared_mem_size << "," << work_mem_size << std::endl;
+
+    cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
+                               block_size};
+    multi_contains_kernel<tile_size>
       <<<grid.num_blocks, grid.num_threads_per_block, shared_mem_size, stream.value()>>>(
         *d_strings,
         *d_targets,

From b5493ec331be0a56487fd75ea68dec1aa7f14527 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 22 Oct 2024 10:24:13 -0400
Subject: [PATCH 14/23] fix threshold check

---
 cpp/src/strings/search/contains_multiple.cu | 4 ++--
 cpp/tests/strings/find_tests.cpp            | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index cc91719915b..190881c9f34 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -81,8 +81,8 @@ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64;
  * The logic now only needs to check for either of these 2 targets.
  *
  * This kernel works in either row-per-string or warp-per-string depending
- * on the template parameter. If tile_size==1, then it executes as a
- * row-per-string. If tile_size=32, the it executes as a warp-per-string.
+ * on the template parameter. If tile_size==1, then this kernel executes as
+ * a row-per-string. If tile_size=32, the it executes as a warp-per-string.
  * No other options are supported for now.
  *
  * @tparam tile_size Number of threads per string
diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp
index 6b2163aeda1..a3066c40650 100644
--- a/cpp/tests/strings/find_tests.cpp
+++ b/cpp/tests/strings/find_tests.cpp
@@ -23,7 +23,6 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/attributes.hpp>
 #include <cudf/strings/find.hpp>
-#include <cudf/strings/find_multiple.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <vector>

From b6ac3ff983de8b0efa517ab3bde5df7f6e0fdc5c Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 22 Oct 2024 12:49:56 -0400
Subject: [PATCH 15/23] fix threshold check for real

---
 cpp/src/strings/search/contains_multiple.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index 190881c9f34..ff995c53413 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -276,9 +276,9 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
     constexpr cudf::thread_index_type tile_size = cudf::detail::warp_size;
 
     auto const shared_mem_size =
-      (targets.size() < targets_threshold) ? (block_size * targets.size()) : 0;
+      (targets.size() <= targets_threshold) ? (block_size * targets.size()) : 0;
     auto const work_mem_size =
-      (targets.size() < targets_threshold) ? 0 : tile_size * targets.size() * input.size();
+      (targets.size() <= targets_threshold) ? 0 : tile_size * targets.size() * input.size();
     auto working_memory = rmm::device_uvector<bool>(work_mem_size, stream);
 
     // std::cout << shared_mem_size << "," << work_mem_size << std::endl;

From b038e0c13cdc63711d8d802b65c6f6e853a5324b Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 23 Oct 2024 15:14:31 -0400
Subject: [PATCH 16/23] remove commented out debug print

---
 cpp/include/cudf/strings/find_multiple.hpp  | 2 +-
 cpp/src/strings/search/contains_multiple.cu | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 6b9cc3723b3..03a74166256 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -79,7 +79,7 @@ std::unique_ptr<table> contains_multiple(
  *           [-1,-1, 1 ]}  // for "def": "a" and "b" not found, "e" at  pos 1
  * @endcode
  *
- * @throw , std::invalid_argument if `targets` is empty or contains nulls
+ * @throw std::invalid_argument if `targets` is empty or contains nulls
  *
  * @param input Strings instance for this operation
  * @param targets Strings to search for in each string
diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index ff995c53413..e4468f4df9a 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -281,8 +281,6 @@ std::unique_ptr<table> contains_multiple(strings_column_view const& input,
       (targets.size() <= targets_threshold) ? 0 : tile_size * targets.size() * input.size();
     auto working_memory = rmm::device_uvector<bool>(work_mem_size, stream);
 
-    // std::cout << shared_mem_size << "," << work_mem_size << std::endl;
-
     cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
                                block_size};
     multi_contains_kernel<tile_size>

From 3c5cd47c92d01dbc252d856741d1592c8ac1ab9f Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 31 Oct 2024 10:15:37 -0400
Subject: [PATCH 17/23] fix call to build benchmark input col

---
 cpp/benchmarks/string/find_multiple.cpp | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp
index f3115677fa4..514c6a115c3 100644
--- a/cpp/benchmarks/string/find_multiple.cpp
+++ b/cpp/benchmarks/string/find_multiple.cpp
@@ -26,10 +26,6 @@
 
 #include <nvbench/nvbench.cuh>
 
-std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
-                                                 cudf::size_type row_width,
-                                                 int32_t hit_rate);
-
 static void bench_find_string(nvbench::state& state)
 {
   auto const n_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
@@ -38,16 +34,11 @@ static void bench_find_string(nvbench::state& state)
   auto const target_count = static_cast<cudf::size_type>(state.get_int64("targets"));
   auto const api          = state.get_string("api");
 
-  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   auto const stream = cudf::get_default_stream();
-  auto const col    = build_input_column(n_rows, row_width, hit_rate);
+  auto const col    = create_string_column(n_rows, row_width, hit_rate);
   auto const input  = cudf::strings_column_view(col->view());
 
-  // Note that these all match the first row of the raw_data in build_input_column.
+  // Note that these all match the first row of the raw_data in create_string_column.
   // This is so the hit_rate can properly accounted for.
   std::vector<std::string> target_data(
     {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"});
@@ -81,6 +72,6 @@ NVBENCH_BENCH(bench_find_string)
   .set_name("find_multiple")
   .add_string_axis("api", {"find", "contains"})
   .add_int64_axis("targets", {10, 20, 40})
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_int64_axis("hit_rate", {20, 80});  // percentage

From 9910bd35154864c1242d53c17a0855b43be7004f Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 5 Nov 2024 10:23:56 -0500
Subject: [PATCH 18/23] fix copyright year

---
 cpp/benchmarks/string/find_multiple.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp
index 514c6a115c3..0e780fdb302 100644
--- a/cpp/benchmarks/string/find_multiple.cpp
+++ b/cpp/benchmarks/string/find_multiple.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,7 +40,7 @@ static void bench_find_string(nvbench::state& state)
 
   // Note that these all match the first row of the raw_data in create_string_column.
   // This is so the hit_rate can properly accounted for.
-  std::vector<std::string> target_data(
+  std::vector<std::string> const target_data(
     {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"});
   auto h_targets = std::vector<std::string>{};
   for (cudf::size_type i = 0; i < target_count; i++) {

From 31d5a4e13daa7a434396114d7dd3ba3f5da60880 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 5 Nov 2024 13:11:36 -0500
Subject: [PATCH 19/23] use cooperative groups for tile-size

---
 cpp/src/strings/search/contains_multiple.cu | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index e4468f4df9a..5f01f52e60d 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -31,6 +31,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cooperative_groups.h>
 #include <cub/cub.cuh>
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -113,7 +114,9 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
   // get the string for this tile
   auto const d_str = d_strings.element<string_view>(str_idx);
 
-  auto const lane_idx    = idx % tile_size;
+  namespace cg           = cooperative_groups;
+  auto const tile        = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto const lane_idx    = tile.thread_rank();
   auto const tile_idx    = static_cast<cudf::thread_index_type>(threadIdx.x) / tile_size;
   auto const num_targets = d_targets.size();
 
@@ -134,7 +137,7 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
       thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty());
     }
   }
-  if constexpr (tile_size == cudf::detail::warp_size) { __syncwarp(); }
+  tile.sync();
 
   auto const last_ptr = d_first_bytes + unique_count;
   for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes();
@@ -178,7 +181,7 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
       auto const begin = bools + (target_idx * tile_size);
       d_results[target_idx][str_idx] =
         thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity<bool>{});
-      // cooperative group implementation was almost 3x slower than this parallel reduce
+      // cooperative_group any() implementation was almost 3x slower than this parallel reduce
     }
   }
 }

From 3b5576bac048d65b063fc7f14e7f27f14e365cb7 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 5 Nov 2024 13:37:49 -0500
Subject: [PATCH 20/23] replace syncwarp with tile.sync

---
 cpp/src/strings/search/contains_multiple.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index 5f01f52e60d..66a02a14909 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -175,7 +175,7 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
   }
 
   if constexpr (tile_size > 1) {
-    __syncwarp();
+    tile.sync();
     // reduce the bools for each target to store in the result
     for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
       auto const begin = bools + (target_idx * tile_size);

From c0e96dc25122e93b0ea28b0106c9441701f59969 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 5 Nov 2024 13:50:40 -0500
Subject: [PATCH 21/23] use meta_group_rank

---
 cpp/src/strings/search/contains_multiple.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index 66a02a14909..ceebdc5b8fa 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -117,15 +117,15 @@ CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
   namespace cg           = cooperative_groups;
   auto const tile        = cg::tiled_partition<tile_size>(cg::this_thread_block());
   auto const lane_idx    = tile.thread_rank();
-  auto const tile_idx    = static_cast<cudf::thread_index_type>(threadIdx.x) / tile_size;
   auto const num_targets = d_targets.size();
 
   // size of shared_bools = num_targets * block_size
   // each thread uses num_targets bools
   extern __shared__ bool shared_bools[];
   // bools for the current string
-  auto bools = working_memory == nullptr ? (shared_bools + (tile_idx * tile_size * num_targets))
-                                         : (working_memory + (str_idx * tile_size * num_targets));
+  auto bools = working_memory == nullptr
+                 ? (shared_bools + (tile.meta_group_rank() * tile_size * num_targets))
+                 : (working_memory + (str_idx * tile_size * num_targets));
 
   // initialize result: set true if target is empty, false otherwise
   for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {

From 7031dac62f7b088e4a4cb84b1be5ec6d329707b8 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 8 Nov 2024 14:36:08 -0500
Subject: [PATCH 22/23] fix comment wording

---
 cpp/src/strings/search/contains_multiple.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
index ceebdc5b8fa..1183e3e4038 100644
--- a/cpp/src/strings/search/contains_multiple.cu
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -81,7 +81,7 @@ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64;
  * - tgt2_idx = d_indices[map_idx+1] = 1 --> d_targets[1] == 'hello'
  * The logic now only needs to check for either of these 2 targets.
  *
- * This kernel works in either row-per-string or warp-per-string depending
+ * This kernel works in either thread-per-string or warp-per-string depending
  * on the template parameter. If tile_size==1, then this kernel executes as
  * a row-per-string. If tile_size=32, the it executes as a warp-per-string.
  * No other options are supported for now.

From 4ef6d46a69f4d3ab8e88253b2ba4e75a423d0ea6 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 12 Nov 2024 09:19:52 -0600
Subject: [PATCH 23/23] 1st instead of 1th

---
 cpp/include/cudf/strings/find_multiple.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 03a74166256..e090766dd07 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -31,7 +31,7 @@ namespace strings {
  * @brief Searches for the given target strings within each string in the provided column
  *
  * Each column in the result table corresponds to the result for the target string at the same
- * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th,
+ * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1st for 1st,
  * etc.
  *
  * If the target is not found for a string, false is returned for that entry in the output column.