From 59a46fe1c4f878a3fc67002f9e79609f6e0e61ba Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 7 Sep 2023 16:32:52 -0400 Subject: [PATCH 1/5] Expose stream parameter in public strings find APIs --- cpp/include/cudf/strings/find.hpp | 100 ++++++++++++--------- cpp/include/cudf/strings/find_multiple.hpp | 12 +-- cpp/include/cudf/strings/findall.hpp | 2 + cpp/src/strings/search/find.cu | 24 +++-- cpp/src/strings/search/find_multiple.cu | 3 +- cpp/src/strings/search/findall.cu | 3 +- 6 files changed, 87 insertions(+), 57 deletions(-) diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index 2fed36862b9..2b63dfe4935 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -43,19 +43,21 @@ namespace strings { * * @throw cudf::logic_error if start position is greater than stop position. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param start First character position to include in the search. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param start First character position to include in the search * @param stop Last position (exclusive) to include in the search. * Default of -1 will search to the end of the string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values */ std::unique_ptr find( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -72,19 +74,21 @@ std::unique_ptr find( * * @throw cudf::logic_error if start position is greater than stop position. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param start First position to include in the search. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param start First position to include in the search * @param stop Last position (exclusive) to include in the search. * Default of -1 will search starting at the end of the string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values */ std::unique_ptr rfind( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -123,37 +127,41 @@ std::unique_ptr find( * * Any null string entries return corresponding null entries in the output columns. 
 * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr contains( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of boolean values for each string where true indicates * the corresponding target string was found within that string in the provided column. * - * The 'output[i] = true` if string `targets[i]` is found inside `strings[i]` otherwise + * The `output[i] = true` if string `targets[i]` is found inside `input[i]` otherwise * `output[i] = false`. * If `target[i]` is an empty string, true is returned for `output[i]`. * If `target[i]` is null, false is returned for `output[i]`. * - * Any null `strings[i]` row results in a null `output[i]` row. + * Any null string entries return corresponding null entries in the output columns. * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings column of targets to check row-wise in `strings`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. 
+ * @param input Strings instance for this operation + * @param targets Strings column of targets to check row-wise in `input` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr contains( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -166,14 +174,16 @@ std::unique_ptr contains( * * Any null string entries return corresponding null entries in the output columns. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New type_id::BOOL8 column. */ std::unique_ptr starts_with( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -190,14 +200,16 @@ std::unique_ptr starts_with( * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. 
+ * @param input Strings instance for this operation + * @param targets Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr starts_with( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -210,14 +222,16 @@ std::unique_ptr starts_with( * * Any null string entries return corresponding null entries in the output columns. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr ends_with( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -234,14 +248,16 @@ std::unique_ptr ends_with( * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. 
+ * @param input Strings instance for this operation + * @param targets Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr ends_with( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 21cfdb15146..06b851c5012 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,14 +48,16 @@ namespace strings { * * @throw cudf::logic_error if `targets` is empty or contains nulls * - * @param input Strings instance for this operation. - * @param targets Strings to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Lists column with character position values. 
+ * @param input Strings instance for this operation + * @param targets Strings to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Lists column with character position values */ std::unique_ptr find_multiple( strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 745f0fc19ff..379b9624dc6 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -57,12 +57,14 @@ struct regex_program; * * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New lists column of strings */ std::unique_ptr findall( strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 3de9dd34d83..1299e552565 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -305,20 +305,22 @@ std::unique_ptr find(strings_column_view const& strings, string_scalar const& target, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find(strings, target, start, stop, cudf::get_default_stream(), mr); + return detail::find(strings, target, start, stop, stream, 
mr); } std::unique_ptr rfind(strings_column_view const& strings, string_scalar const& target, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rfind(strings, target, start, stop, cudf::get_default_stream(), mr); + return detail::rfind(strings, target, start, stop, stream, mr); } std::unique_ptr find(strings_column_view const& input, @@ -618,50 +620,56 @@ std::unique_ptr ends_with(strings_column_view const& strings, std::unique_ptr contains(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, target, cudf::get_default_stream(), mr); + return detail::contains(strings, target, stream, mr); } std::unique_ptr contains(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, targets, cudf::get_default_stream(), mr); + return detail::contains(strings, targets, stream, mr); } std::unique_ptr starts_with(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, target, cudf::get_default_stream(), mr); + return detail::starts_with(strings, target, stream, mr); } std::unique_ptr starts_with(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, targets, cudf::get_default_stream(), mr); + return detail::starts_with(strings, targets, stream, mr); } std::unique_ptr ends_with(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return 
detail::ends_with(strings, target, cudf::get_default_stream(), mr); + return detail::ends_with(strings, target, stream, mr); } std::unique_ptr ends_with(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ends_with(strings, targets, cudf::get_default_stream(), mr); + return detail::ends_with(strings, targets, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index 4a823ad1dcb..65c9dc63755 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -88,10 +88,11 @@ std::unique_ptr find_multiple(strings_column_view const& input, // external API std::unique_ptr find_multiple(strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find_multiple(input, targets, cudf::get_default_stream(), mr); + return detail::find_multiple(input, targets, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 2df64c6a0a7..acea4ff1c51 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -134,10 +134,11 @@ std::unique_ptr findall(strings_column_view const& input, std::unique_ptr findall(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::findall(input, prog, cudf::get_default_stream(), mr); + return detail::findall(input, prog, stream, mr); } } // namespace strings From 003bc87c6a46017626f0c56f929be6feb7b545f0 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 12 Sep 2023 17:48:04 -0400 Subject: [PATCH 2/5] add streams tests --- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/strings/find_test.cpp 
| 49 +++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 cpp/tests/streams/strings/find_test.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a69dc9bf2f8..8d88a41b6ce 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -627,6 +627,7 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_STRINGS_TEST streams/strings/find_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp new file mode 100644 index 00000000000..a17a3d701e6 --- /dev/null +++ b/cpp/tests/streams/strings/find_test.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class StringsFindTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsFindTest, Find) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const target = cudf::string_scalar("é"); + cudf::strings::find(view, target, 0, -1, cudf::get_default_stream()); + cudf::strings::rfind(view, target, 0, -1, cudf::get_default_stream()); + cudf::strings::find(view, view, 0, cudf::get_default_stream()); + cudf::strings::find_multiple(view, view, cudf::get_default_stream()); + cudf::strings::contains(view, target, cudf::get_default_stream()); + cudf::strings::starts_with(view, target, cudf::get_default_stream()); + cudf::strings::starts_with(view, view, cudf::get_default_stream()); + cudf::strings::ends_with(view, view, cudf::get_default_stream()); + cudf::strings::ends_with(view, target, cudf::get_default_stream()); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::findall(view, *prog, cudf::get_default_stream()); +} From cbd5b8f4d2d56e2fa6297d1c0f934da9efa6b7c8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 12 Sep 2023 19:46:29 -0400 Subject: [PATCH 3/5] fix numeric-scalar stream passing --- cpp/src/strings/search/find_multiple.cu | 4 ++-- cpp/tests/streams/strings/find_test.cpp | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index 65c9dc63755..fcaec835f4d 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -70,8 +70,8 @@ std::unique_ptr find_multiple(strings_column_view const& input, results->set_null_count(0); auto offsets = cudf::detail::sequence(strings_count + 1, - numeric_scalar(0), - numeric_scalar(targets_count), + 
numeric_scalar(0, true, stream), + numeric_scalar(targets_count, true, stream), stream, mr); return make_lists_column(strings_count, diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp index a17a3d701e6..b734a1738cc 100644 --- a/cpp/tests/streams/strings/find_test.cpp +++ b/cpp/tests/streams/strings/find_test.cpp @@ -32,18 +32,18 @@ TEST_F(StringsFindTest, Find) auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); auto view = cudf::strings_column_view(input); - auto const target = cudf::string_scalar("é"); - cudf::strings::find(view, target, 0, -1, cudf::get_default_stream()); - cudf::strings::rfind(view, target, 0, -1, cudf::get_default_stream()); - cudf::strings::find(view, view, 0, cudf::get_default_stream()); - cudf::strings::find_multiple(view, view, cudf::get_default_stream()); - cudf::strings::contains(view, target, cudf::get_default_stream()); - cudf::strings::starts_with(view, target, cudf::get_default_stream()); - cudf::strings::starts_with(view, view, cudf::get_default_stream()); - cudf::strings::ends_with(view, view, cudf::get_default_stream()); - cudf::strings::ends_with(view, target, cudf::get_default_stream()); + auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + cudf::strings::find(view, target, 0, -1, cudf::test::get_default_stream()); + cudf::strings::rfind(view, target, 0, -1, cudf::test::get_default_stream()); + cudf::strings::find(view, view, 0, cudf::test::get_default_stream()); + cudf::strings::find_multiple(view, view, cudf::test::get_default_stream()); + cudf::strings::contains(view, target, cudf::test::get_default_stream()); + cudf::strings::starts_with(view, target, cudf::test::get_default_stream()); + cudf::strings::starts_with(view, view, cudf::test::get_default_stream()); + cudf::strings::ends_with(view, target, cudf::test::get_default_stream()); + cudf::strings::ends_with(view, view, cudf::test::get_default_stream()); 
auto const pattern = std::string("[a-z]"); auto const prog = cudf::strings::regex_program::create(pattern); - cudf::strings::findall(view, *prog, cudf::get_default_stream()); + cudf::strings::findall(view, *prog, cudf::test::get_default_stream()); } From 0c05d42c8c80697358859dbb81800fd812431273 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 19 Sep 2023 09:12:13 -0400 Subject: [PATCH 4/5] fix style violation --- cpp/tests/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 218a3b482b0..6414962903e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -627,7 +627,10 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE testing) +ConfigureTest( + STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE + testing +) # ################################################################################################## # Install tests #################################################################################### From 5e1e2d68ead5e9c145123b8b4317f9a5eb02f16e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 19 Sep 2023 09:20:28 -0400 Subject: [PATCH 5/5] fix typo in doxygen --- cpp/include/cudf/strings/find.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index 2b63dfe4935..c1aa8b294b3 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -177,7 +177,7 @@ std::unique_ptr contains( * @param input Strings instance for this 
operation * @param target UTF-8 encoded string to search for in each string * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param mr Device memory resource used to allocate the returned column's device memory * @return New type_id::BOOL8 column. */ std::unique_ptr starts_with(