Skip to content

Commit

Permalink
Fix nvtext::generate_character_ngrams performance regression for longer strings (#13874)
Browse files Browse the repository at this point in the history

Fixes a performance regression when generating character ngrams. The regression was introduced when common code was refactored as part of adding the `nvtext::hash_character_ngrams` function (Reference #13654). Undoing that refactoring ("defactoring") fixes the regression. Overall, the two functions have only about 6 lines of code in common, so the resulting duplication is expected to require minimal maintenance.
The defactoring reinstates the original kernel code logic for `nvtext::generate_character_ngrams`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: #13874
  • Loading branch information
davidwendt authored Aug 16, 2023
1 parent 20c3aab commit 709b15f
Showing 1 changed file with 42 additions and 73 deletions.
115 changes: 42 additions & 73 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -160,82 +160,41 @@ namespace detail {
namespace {

/**
* @brief Base class for generating character ngrams
*
* The ngrams are produced for each string and the derived class's
* `process_ngram` function is called for each ngram/substring.
* @brief Generate character ngrams for each string
*
* @tparam Derived class uses the CRTP pattern to reuse code logic.
* Each string produces many strings depending on the ngram width and the string size.
* This functor can be used with `make_strings_children` to build the offsets and
* the chars child columns.
*/
template <typename Derived>
struct base_character_ngram_fn {
struct character_ngram_generator_fn {
cudf::column_device_view const d_strings;
cudf::size_type ngrams;
cudf::size_type const* d_ngram_offsets{};
cudf::size_type* d_offsets{};
char* d_chars{};

base_character_ngram_fn(cudf::column_device_view const& d_strings,
cudf::size_type ngrams,
cudf::size_type const* d_ngram_offsets)
: d_strings(d_strings), ngrams(ngrams), d_ngram_offsets(d_ngram_offsets)
{
}

__device__ void operator()(cudf::size_type idx) const
__device__ void operator()(cudf::size_type idx)
{
if (d_strings.is_null(idx)) return;
auto const d_str = d_strings.element<cudf::string_view>(idx);
if (d_str.empty()) return;
auto const& derived = static_cast<Derived const&>(*this);
auto itr = d_str.begin();
auto const ngram_offset = d_ngram_offsets[idx];
auto const ngram_count = d_ngram_offsets[idx + 1] - ngram_offset;
auto d_sizes = d_offsets + ngram_offset;
auto out_ptr = d_chars ? d_chars + *d_sizes : nullptr;
for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) {
auto const begin = itr.byte_offset();
auto const end = (itr + ngrams).byte_offset();
auto const ngram = cudf::string_view(d_str.data() + begin, end - begin);
derived.process_ngram(ngram, n + ngram_offset);
}
}
};

/**
* @brief Generate character ngrams for each string
*
* Each string produces many strings depending on the ngram width and the string size.
* This functor can be used with `make_strings_children` to build the offsets and
* the chars child columns.
*/
struct character_ngram_generator_fn : base_character_ngram_fn<character_ngram_generator_fn> {
cudf::size_type* d_offsets{};
char* d_chars{};

character_ngram_generator_fn(cudf::column_device_view const& d_strings,
cudf::size_type ngrams,
cudf::size_type const* d_ngram_offsets)
: base_character_ngram_fn(d_strings, ngrams, d_ngram_offsets)
{
}

/**
* @brief Called through the base class for each ngram
*
* Either stores the size of each string or copies the string to the output
*
* @param d_str The ngram substring to process
* @param offset The output position relative to d_offsets
*/
__device__ void process_ngram(cudf::string_view d_str, cudf::size_type offset) const
{
auto d_str_offsets = d_offsets + offset;
if (d_chars) {
auto out_ptr = d_chars + *d_str_offsets;
cudf::strings::detail::copy_string(out_ptr, d_str);
} else {
*d_str_offsets = d_str.size_bytes();
if (d_chars) {
out_ptr =
cudf::strings::detail::copy_and_increment(out_ptr, d_str.data() + begin, (end - begin));
} else {
*d_sizes++ = end - begin;
}
}
}
};

} // namespace

std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
Expand All @@ -253,7 +212,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
auto const d_strings = *strings_column;

// create a vector of ngram offsets for each string
rmm::device_uvector<int32_t> ngram_offsets(strings_count + 1, stream);
rmm::device_uvector<cudf::size_type> ngram_offsets(strings_count + 1, stream);
thrust::transform_exclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
Expand All @@ -262,7 +221,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
[d_strings, strings_count, ngrams] __device__(auto idx) {
if (d_strings.is_null(idx) || (idx == strings_count)) return 0;
auto const length = d_strings.element<cudf::string_view>(idx).length();
return std::max(0, static_cast<int32_t>(length + 1 - ngrams));
return std::max(0, static_cast<cudf::size_type>(length + 1 - ngrams));
},
cudf::size_type{0},
thrust::plus<cudf::size_type>());
Expand All @@ -282,23 +241,33 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie

namespace {
/**
* @brief Computes the hash of each ngram as produced by the base class
* @brief Computes the hash of each character ngram
*
* Each thread processes a single string. Substrings are resolved for every character
* of the string and hashed.
*/
struct character_ngram_hash_fn : base_character_ngram_fn<character_ngram_hash_fn> {
cudf::hash_value_type* d_hashes;

character_ngram_hash_fn(cudf::column_device_view const& d_strings,
cudf::size_type ngrams,
cudf::size_type const* d_ngram_offsets,
cudf::hash_value_type* d_hashes)
: base_character_ngram_fn(d_strings, ngrams, d_ngram_offsets), d_hashes(d_hashes)
{
}
struct character_ngram_hash_fn {
cudf::column_device_view const d_strings;
cudf::size_type ngrams;
cudf::size_type const* d_ngram_offsets;
cudf::hash_value_type* d_results;

__device__ void process_ngram(cudf::string_view d_str, cudf::size_type offset) const
__device__ void operator()(cudf::size_type idx) const
{
auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
d_hashes[offset] = hasher(d_str);
if (d_strings.is_null(idx)) return;
auto const d_str = d_strings.element<cudf::string_view>(idx);
if (d_str.empty()) return;
auto itr = d_str.begin();
auto const ngram_offset = d_ngram_offsets[idx];
auto const ngram_count = d_ngram_offsets[idx + 1] - ngram_offset;
auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
auto d_hashes = d_results + ngram_offset;
for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) {
auto const begin = itr.byte_offset();
auto const end = (itr + ngrams).byte_offset();
auto const ngram = cudf::string_view(d_str.data() + begin, end - begin);
*d_hashes++ = hasher(ngram);
}
}
};
} // namespace
Expand Down

0 comments on commit 709b15f

Please sign in to comment.