Skip to content

Commit

Permalink
Expose stream parameter in public nvtext ngram APIs
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Sep 7, 2023
1 parent b4da39c commit 89b595a
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 22 deletions.
26 changes: 16 additions & 10 deletions cpp/include/nvtext/generate_ngrams.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,20 @@ namespace nvtext {
* @throw cudf::logic_error if `separator` is invalid
* @throw cudf::logic_error if there are not enough strings to generate any ngrams
*
* @param strings Strings column to tokenize and produce ngrams from.
* @param input Strings column to tokenize and produce ngrams from
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param separator The string to use for separating ngram tokens.
* Default is "_" character.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> generate_ngrams(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::size_type ngrams = 2,
cudf::string_scalar const& separator = cudf::string_scalar{"_"},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -79,15 +81,17 @@ std::unique_ptr<cudf::column> generate_ngrams(
* @throw cudf::logic_error if `ngrams < 2`
* @throw cudf::logic_error if there are not enough characters to generate any ngrams
*
* @param strings Strings column to produce ngrams from.
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> generate_character_ngrams(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::size_type ngrams = 2,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -113,14 +117,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
* @throw cudf::logic_error if `ngrams < 2`
* @throw cudf::logic_error if there are not enough characters to generate any ngrams
*
* @param strings Strings column to produce ngrams from.
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate. Default is 5.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A lists column of hash values
*/
std::unique_ptr<cudf::column> hash_character_ngrams(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::size_type ngrams = 5,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
12 changes: 7 additions & 5 deletions cpp/include/nvtext/ngrams_tokenize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -66,21 +66,23 @@ namespace nvtext {
*
* All null row entries are ignored and the output contains all valid rows.
*
* @param strings Strings column to tokenize and produce ngrams from.
* @param input Strings column to tokenize and produce ngrams from
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param delimiter UTF-8 characters used to separate each string into tokens.
* The default of empty string will separate tokens using whitespace.
* @param separator The string to use for separating ngram tokens.
* Default is "_" character.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> ngrams_tokenize(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::size_type ngrams = 2,
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
cudf::string_scalar const& separator = cudf::string_scalar{"_"},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,11 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr);
return detail::generate_ngrams(strings, ngrams, separator, stream, mr);
}

namespace detail {
Expand Down Expand Up @@ -317,18 +318,20 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co

std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
return detail::generate_character_ngrams(strings, ngrams, stream, mr);
}

std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
return detail::hash_character_ngrams(strings, ngrams, stream, mr);
}

} // namespace nvtext
4 changes: 2 additions & 2 deletions cpp/src/text/jaccard.cu
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view con
*
* This is called with a warp per row
*/
struct sorted_interset_fn {
struct sorted_intersect_fn {
cudf::column_device_view const d_input1;
cudf::column_device_view const d_input2;
cudf::size_type* d_results;
Expand Down Expand Up @@ -151,7 +151,7 @@ rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view
auto const d_input1 = cudf::column_device_view::create(input1, stream);
auto const d_input2 = cudf::column_device_view::create(input2, stream);
auto d_results = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()};
sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
input1.size() * cudf::detail::warp_size,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -265,11 +265,11 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
cudf::size_type ngrams,
cudf::string_scalar const& delimiter,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::ngrams_tokenize(
strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr);
return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr);
}

} // namespace nvtext

0 comments on commit 89b595a

Please sign in to comment.