Expose stream parameter in public nvtext ngram APIs

rapidsai · Sep 7, 2023 · 89b595a
1 parent b4da39c
commit 89b595a
Show file tree

Hide file tree

Showing 5 changed files with 33 additions and 22 deletions.
diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp
@@ -47,18 +47,20 @@ namespace nvtext {
  * @throw cudf::logic_error if `separator` is invalid
  * @throw cudf::logic_error if there are not enough strings to generate any ngrams
  *
- * @param strings Strings column to tokenize and produce ngrams from.
+ * @param input Strings column to tokenize and produce ngrams from
  * @param ngrams The ngram number to generate.
  *               Default is 2 = bigram.
  * @param separator The string to use for separating ngram tokens.
  *                  Default is "_" character.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column of tokens
  */
 std::unique_ptr<cudf::column> generate_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams               = 2,
   cudf::string_scalar const& separator = cudf::string_scalar{"_"},
+  rmm::cuda_stream_view stream         = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
 
 /**
@@ -79,15 +81,17 @@ std::unique_ptr<cudf::column> generate_ngrams(
  * @throw cudf::logic_error if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
- * @param strings Strings column to produce ngrams from.
+ * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate.
  *               Default is 2 = bigram.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column of tokens
  */
 std::unique_ptr<cudf::column> generate_character_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams              = 2,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -113,14 +117,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
  * @throw cudf::logic_error if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
- * @param strings Strings column to produce ngrams from.
+ * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate. Default is 5.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return A lists column of hash values
  */
 std::unique_ptr<cudf::column> hash_character_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams              = 5,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group

diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -66,21 +66,23 @@ namespace nvtext {
  *
  * All null row entries are ignored and the output contains all valid rows.
  *
- * @param strings Strings column to tokenize and produce ngrams from.
+ * @param input Strings column to tokenize and produce ngrams from
  * @param ngrams The ngram number to generate.
  *               Default is 2 = bigram.
  * @param delimiter UTF-8 characters used to separate each string into tokens.
  *                  The default of empty string will separate tokens using whitespace.
  * @param separator The string to use for separating ngram tokens.
  *                  Default is "_" character.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column of tokens
  */
 std::unique_ptr<cudf::column> ngrams_tokenize(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams               = 2,
   cudf::string_scalar const& delimiter = cudf::string_scalar{""},
   cudf::string_scalar const& separator = cudf::string_scalar{"_"},
+  rmm::cuda_stream_view stream         = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group

diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
@@ -150,10 +150,11 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
 std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& strings,
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& separator,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr);
+  return detail::generate_ngrams(strings, ngrams, separator, stream, mr);
 }
 
 namespace detail {
@@ -317,18 +318,20 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
 
 std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
                                                         cudf::size_type ngrams,
+                                                        rmm::cuda_stream_view stream,
                                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
+  return detail::generate_character_ngrams(strings, ngrams, stream, mr);
 }
 
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
                                                     cudf::size_type ngrams,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
+  return detail::hash_character_ngrams(strings, ngrams, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
@@ -107,7 +107,7 @@ rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view con
  *
  * This is called with a warp per row
  */
-struct sorted_interset_fn {
+struct sorted_intersect_fn {
   cudf::column_device_view const d_input1;
   cudf::column_device_view const d_input2;
   cudf::size_type* d_results;
@@ -151,7 +151,7 @@ rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view
   auto const d_input1 = cudf::column_device_view::create(input1, stream);
   auto const d_input2 = cudf::column_device_view::create(input2, stream);
   auto d_results      = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
-  sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()};
+  sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::counting_iterator<cudf::size_type>(0),
                      input1.size() * cudf::detail::warp_size,

diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
@@ -265,11 +265,11 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& delimiter,
                                               cudf::string_scalar const& separator,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::ngrams_tokenize(
-    strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr);
+  return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr);
 }
 
 }  // namespace nvtext