From 60009a8005a8b9b69c2c870465b5cf46532d3388 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 13 Sep 2023 17:12:44 -0500 Subject: [PATCH 01/23] Fix naming issues with `Index.to_frame` and `MultiIndex.to_frame` APIs (#14105) This PR: - [x] Introduces `allow_duplicates` for parity with `MultiIndex.to_frame` - however this parameter is non-functional since cudf doesn't support duplicate column names. - [x] Fixed handling of duplicate index names in `MultiIndex.to_frame` - [x] Added proper docs for `Index.to_frame` & `MultiIndex.to_frame` separately due to change in API signature. - [x] Added tests for `Index.to_frame` & `MultiIndex.to_frame` - [x] Introduced deprecations that will go away when pandas-2.0 support is enabled. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14105 --- python/cudf/cudf/core/_base_index.py | 57 +++++++++++-- python/cudf/cudf/core/multiindex.py | 99 ++++++++++++++++++++--- python/cudf/cudf/tests/test_index.py | 19 +++++ python/cudf/cudf/tests/test_multiindex.py | 83 +++++++++++++++++++ 4 files changed, 242 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2f6e864b51c..c0bd9ec6eee 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -19,6 +19,7 @@ drop_nulls, ) from cudf._lib.types import size_type_dtype +from cudf.api.extensions import no_default from cudf.api.types import ( is_bool_dtype, is_integer, @@ -701,21 +702,65 @@ def fillna(self, value, downcast=None): return super().fillna(value=value) - def to_frame(self, index=True, name=None): + def to_frame(self, index=True, name=no_default): """Create a DataFrame with a column containing this Index Parameters ---------- index : boolean, default True Set the index of the returned DataFrame as the original Index - name : str, default None - Name to be used for 
the column + name : object, defaults to index.name + The passed name should substitute for the index name (if it has + one). + Returns ------- DataFrame - cudf DataFrame - """ - if name is not None: + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow + """ + if name is None: + warnings.warn( + "Explicitly passing `name=None` currently preserves " + "the Index's name or uses a default name of 0. This " + "behaviour is deprecated, and in the future `None` " + "will be used as the name of the " + "resulting DataFrame column.", + FutureWarning, + ) + name = no_default + if name is not no_default: col_name = name elif self.name is None: col_name = 0 diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bc6726879c1..21380bb841c 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -20,6 +20,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column from cudf.core._compat import PANDAS_GE_150 @@ -1015,7 +1016,12 @@ def __getitem__(self, index): elif isinstance(index, slice): start, stop, step = index.indices(len(self)) index = column.arange(start, stop, step) - result = MultiIndex.from_frame(self.to_frame(index=False).take(index)) + result = 
MultiIndex.from_frame( + self.to_frame(index=False, name=range(0, self.nlevels)).take( + index + ), + names=self.names, + ) # we are indexing into a single row of the MultiIndex, # return that row as a tuple: @@ -1026,24 +1032,95 @@ def __getitem__(self, index): result._codes = self._codes.take(index) if self._levels is not None: result._levels = self._levels - result.names = self.names return result @_cudf_nvtx_annotate - def to_frame(self, index=True, name=None): + def to_frame(self, index=True, name=no_default, allow_duplicates=False): + """ + Create a DataFrame with the levels of the MultiIndex as columns. + + Column ordering is determined by the DataFrame constructor with data as + a dict. + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original MultiIndex. + name : list / sequence of str, optional + The passed names should substitute index level names. + allow_duplicates : bool, optional default False + Allow duplicate column labels to be created. Note + that this parameter is non-functional because + duplicates column labels aren't supported in cudf. + + Returns + ------- + DataFrame + + Examples + -------- + >>> import cudf + >>> mi = cudf.MultiIndex.from_tuples([('a', 'c'), ('b', 'd')]) + >>> mi + MultiIndex([('a', 'c'), + ('b', 'd')], + ) + + >>> df = mi.to_frame() + >>> df + 0 1 + a c a c + b d b d + + >>> df = mi.to_frame(index=False) + >>> df + 0 1 + 0 a c + 1 b d + + >>> df = mi.to_frame(name=['x', 'y']) + >>> df + x y + a c a c + b d b d + """ # TODO: Currently this function makes a shallow copy, which is # incorrect. We want to make a deep copy, otherwise further # modifications of the resulting DataFrame will affect the MultiIndex. - df = cudf.DataFrame._from_data(data=self._data) - if index: - df = df.set_index(self) - if name is not None: + if name is None: + warnings.warn( + "Explicitly passing `name=None` currently preserves the " + "Index's name or uses a default name of 0. 
This behaviour " + "is deprecated, and in the future `None` will be used " + "as the name of the resulting DataFrame column.", + FutureWarning, + ) + name = no_default + + if name is not no_default: if len(name) != len(self.levels): raise ValueError( "'name' should have the same length as " "number of levels on index." ) - df.columns = name + column_names = name + else: + column_names = self.names + all_none_names = None + if not ( + all_none_names := all(x is None for x in column_names) + ) and len(column_names) != len(set(column_names)): + raise ValueError("Duplicate column names are not allowed") + df = cudf.DataFrame._from_data( + data=self._data, + columns=column_names + if name is not no_default and not all_none_names + else None, + ) + + if index: + df = df.set_index(self) + return df @_cudf_nvtx_annotate @@ -1504,7 +1581,9 @@ def droplevel(self, level=-1): @_cudf_nvtx_annotate def to_pandas(self, nullable=False, **kwargs): - result = self.to_frame(index=False).to_pandas(nullable=nullable) + result = self.to_frame( + index=False, name=list(range(self.nlevels)) + ).to_pandas(nullable=nullable) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod @@ -1623,7 +1702,7 @@ def _clean_nulls_from_index(self): Convert all na values(if any) in MultiIndex object to `` as a preprocessing step to `__repr__` methods. 
""" - index_df = self.to_frame(index=False) + index_df = self.to_frame(index=False, name=list(range(self.nlevels))) return MultiIndex.from_frame( index_df._clean_nulls_from_dataframe(index_df), names=self.names ) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 6fb615c22e0..b3791cddce3 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200 from cudf.core.index import ( @@ -2777,3 +2778,21 @@ def test_index_empty_from_pandas(request, dtype): gidx = cudf.from_pandas(pidx) assert_eq(pidx, gidx) + + +@pytest.mark.parametrize( + "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] +) +@pytest.mark.parametrize("data_name", [None, 1, "abc"]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("name", [None, no_default, 1, "abc"]) +def test_index_to_frame(data, data_name, index, name): + pidx = pd.Index(data, name=data_name) + gidx = cudf.from_pandas(pidx) + + with expect_warning_if(name is None): + expected = pidx.to_frame(index=index, name=name) + with expect_warning_if(name is None): + actual = gidx.to_frame(index=index, name=name) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 3c843ace0a8..fb2b0c07efb 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -16,6 +16,7 @@ import pytest import cudf +from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_GE_200 from cudf.core.column import as_column from cudf.core.index import as_index @@ -1926,3 +1927,85 @@ def test_multiindex_to_series_error(): midx = cudf.MultiIndex.from_tuples([("a", "b")]) with pytest.raises(NotImplementedError): midx.to_series() + + 
+@pytest.mark.parametrize( + "pidx", + [ + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "a", "a"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + ), + ], +) +@pytest.mark.parametrize( + "name", [None, no_default, ["x", "y", "z"], ["rapids", "rapids", "rapids"]] +) +@pytest.mark.parametrize("allow_duplicates", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +def test_multiindex_to_frame_allow_duplicates( + pidx, name, allow_duplicates, index +): + gidx = cudf.from_pandas(pidx) + + if ( + ( + len(pidx.names) != len(set(pidx.names)) + and not all(x is None for x in pidx.names) + ) + and not allow_duplicates + and (name is None or name is no_default) + ): + assert_exceptions_equal( + pidx.to_frame, + gidx.to_frame, + lfunc_args_and_kwargs=( + [], + { + "index": index, + "name": name, + "allow_duplicates": allow_duplicates, + }, + ), + rfunc_args_and_kwargs=( + [], + { + "index": index, + "name": name, + "allow_duplicates": allow_duplicates, + }, + ), + ) + else: + if ( + len(pidx.names) != len(set(pidx.names)) + and not all(x is None for x in pidx.names) + and not isinstance(name, list) + ) or (isinstance(name, list) and len(name) != len(set(name))): + # cudf doesn't have the ability to construct dataframes + # with duplicate column names + with expect_warning_if(name is None): + with pytest.raises(ValueError): + gidx.to_frame( + index=index, + name=name, + allow_duplicates=allow_duplicates, + ) + else: + with expect_warning_if(name is None): + expected = pidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) + with expect_warning_if(name is None): + actual = gidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) + + assert_eq(expected, actual) From edfef800d98491ee61b390645548f9223bbfb049 Mon Sep 17 00:00:00 
2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 13 Sep 2023 16:54:45 -0700 Subject: [PATCH 02/23] Refactor `hash_reduce_by_row` (#14095) This PR extracts `hash_reduce_by_row` function from `distinct_reduce.*` files. Previously, that function was designed specifically to work with `distinct` in stream compaction with `size_type` output. Now, it becomes more generic and can support more generic reduction operations and various output types. No new functionality was added. The changes in this work pave the way for implementing histogram/merge histogram aggregations, which also rely on hash-based reduction. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14095 --- cpp/CMakeLists.txt | 2 +- .../cudf/detail/hash_reduce_by_row.cuh | 167 ++++++++++++++++++ cpp/src/stream_compaction/distinct.cu | 28 +-- cpp/src/stream_compaction/distinct_count.cu | 4 +- cpp/src/stream_compaction/distinct_helpers.cu | 109 ++++++++++++ ...stinct_reduce.cuh => distinct_helpers.hpp} | 12 +- cpp/src/stream_compaction/distinct_reduce.cu | 150 ---------------- .../stream_compaction_common.cuh | 22 --- .../stream_compaction_common.hpp | 5 - 9 files changed, 299 insertions(+), 200 deletions(-) create mode 100644 cpp/include/cudf/detail/hash_reduce_by_row.cuh create mode 100644 cpp/src/stream_compaction/distinct_helpers.cu rename cpp/src/stream_compaction/{distinct_reduce.cuh => distinct_helpers.hpp} (92%) delete mode 100644 cpp/src/stream_compaction/distinct_reduce.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c37d05a21c7..900e9eed98e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -530,7 +530,7 @@ add_library( src/stream_compaction/apply_boolean_mask.cu src/stream_compaction/distinct.cu src/stream_compaction/distinct_count.cu -
src/stream_compaction/distinct_reduce.cu + src/stream_compaction/distinct_helpers.cu src/stream_compaction/drop_nans.cu src/stream_compaction/drop_nulls.cu src/stream_compaction/stable_distinct.cu diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh new file mode 100644 index 00000000000..2d2b43f1d4a --- /dev/null +++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf::detail { + +using hash_map_type = + cuco::static_map; + +/** + * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are + * rows that compared equal. + * + * TODO: We need to switch to use `static_reduction_map` when it is ready + * (https://github.com/NVIDIA/cuCollections/pull/98). + */ +template +struct reduce_by_row_fn_base { + protected: + MapView const d_map; + KeyHasher const d_hasher; + KeyEqual const d_equal; + OutputType* const d_output; + + reduce_by_row_fn_base(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + OutputType* const d_output) + : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output} + { + } + + /** + * @brief Return a pointer to the output array at the given index. 
+ * + * @param idx The access index + * @return A pointer to the given index in the output array + */ + __device__ OutputType* get_output_ptr(size_type const idx) const + { + auto const iter = d_map.find(idx, d_hasher, d_equal); + + if (iter != d_map.end()) { + // Only one (undetermined) index value of the duplicate rows could be inserted into the map. + // As such, looking up for all indices of duplicate rows always returns the same value. + auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); + + // All duplicate rows will have concurrent access to this same output slot. + return &d_output[inserted_idx]; + } else { + // All input `idx` values have been inserted into the map before. + // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if + // `d_equal(idx, idx) == false`. + // Such situations are due to comparing nulls or NaNs which are considered as always unequal. + // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct + // output slot. + return &d_output[idx]; + } + } +}; + +/** + * @brief Perform a reduction on groups of rows that are compared equal. + * + * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared + * equal. A hash table is used to find groups of equal rows. + * + * At the beginning of the operation, the entire output array is filled with a value given by + * the `init` parameter. Then, the reduction result for each row group is written into the output + * array at the index of an unspecified row in the group. 
+ * + * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a + * reduction functor derived from `reduce_by_row_fn_base` + * @tparam OutputType Type of the reduction results + * @param map The auxiliary map to perform reduction + * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row + * comparisons + * @param num_rows The number of all input rows + * @param has_nulls Indicate whether the input rows has any nulls at any nested levels + * @param has_nested_columns Indicates whether the input table has any nested columns + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN values in floating point column should be + * considered equal. + * @param init The initial value for reduction of each row group + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned vector + * @return A device_uvector containing the reduction results + */ +template +rmm::device_uvector hash_reduce_by_row( + hash_map_type const& map, + std::shared_ptr const preprocessed_input, + size_type num_rows, + cudf::nullate::DYNAMIC has_nulls, + bool has_nested_columns, + null_equality nulls_equal, + nan_equality nans_equal, + ReduceFuncBuilder func_builder, + OutputType init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const map_dview = map.get_device_view(); + auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); + auto const key_hasher = row_hasher.device_hasher(has_nulls); + auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); + + auto reduction_results = rmm::device_uvector(num_rows, stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init); + + auto const 
reduce_by_row = [&](auto const value_comp) { + if (has_nested_columns) { + auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); + } else { + auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); + } + }; + + if (nans_equal == nan_equality::ALL_EQUAL) { + using nan_equal_comparator = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator; + reduce_by_row(nan_equal_comparator{}); + } else { + using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; + reduce_by_row(nan_unequal_comparator{}); + } + + return reduction_results; +} + +} // namespace cudf::detail diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index cc60b2a12ea..cc1e3423d42 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "distinct_reduce.cuh" +#include "distinct_helpers.hpp" #include #include @@ -50,8 +50,8 @@ rmm::device_uvector get_distinct_indices(table_view const& input, } auto map = hash_map_type{compute_hash_table_size(input.num_rows()), - cuco::empty_key{COMPACTION_EMPTY_KEY_SENTINEL}, - cuco::empty_value{COMPACTION_EMPTY_VALUE_SENTINEL}, + cuco::empty_key{-1}, + cuco::empty_value{std::numeric_limits::min()}, detail::hash_table_allocator_type{default_allocator{}, stream}, stream.value()}; @@ -61,7 +61,7 @@ rmm::device_uvector get_distinct_indices(table_view const& input, auto const has_nested_columns = cudf::detail::has_nested_columns(input); auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); + auto const key_hasher = row_hasher.device_hasher(has_nulls); auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); @@ -96,16 +96,16 @@ rmm::device_uvector get_distinct_indices(table_view const& input, } // For other keep options, reduce by row on rows that compare equal. - auto const reduction_results = hash_reduce_by_row(map, - std::move(preprocessed_input), - input.num_rows(), - has_nulls, - has_nested_columns, - keep, - nulls_equal, - nans_equal, - stream, - rmm::mr::get_current_device_resource()); + auto const reduction_results = reduce_by_row(map, + std::move(preprocessed_input), + input.num_rows(), + has_nulls, + has_nested_columns, + keep, + nulls_equal, + nans_equal, + stream, + rmm::mr::get_current_device_resource()); // Extract the desired output indices from reduction results. 
auto const map_end = [&] { diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 4bca0827efe..ac4811ad279 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -136,14 +136,14 @@ cudf::size_type distinct_count(table_view const& keys, auto const preprocessed_input = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const hash_key = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); + auto const hash_key = row_hasher.device_hasher(has_nulls); auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); auto const comparator_helper = [&](auto const row_equal) { using hasher_type = decltype(hash_key); auto key_set = cuco::experimental::static_set{ cuco::experimental::extent{compute_hash_table_size(num_rows)}, - cuco::empty_key{COMPACTION_EMPTY_KEY_SENTINEL}, + cuco::empty_key{-1}, row_equal, cuco::experimental::linear_probing<1, hasher_type>{hash_key}, detail::hash_table_allocator_type{default_allocator{}, stream}, diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu new file mode 100644 index 00000000000..8f36ec98f4a --- /dev/null +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "distinct_helpers.hpp" + +#include + +namespace cudf::detail { + +namespace { +/** + * @brief The functor to find the first/last/all duplicate row for rows that compared equal. + */ +template +struct reduce_fn : reduce_by_row_fn_base { + duplicate_keep_option const keep; + + reduce_fn(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + duplicate_keep_option const keep, + size_type* const d_output) + : reduce_by_row_fn_base{d_map, + d_hasher, + d_equal, + d_output}, + keep{keep} + { + } + + __device__ void operator()(size_type const idx) const + { + auto const out_ptr = this->get_output_ptr(idx); + + if (keep == duplicate_keep_option::KEEP_FIRST) { + // Store the smallest index of all rows that are equal. + atomicMin(out_ptr, idx); + } else if (keep == duplicate_keep_option::KEEP_LAST) { + // Store the greatest index of all rows that are equal. + atomicMax(out_ptr, idx); + } else { + // Count the number of rows in each group of rows that are compared equal. + atomicAdd(out_ptr, size_type{1}); + } + } +}; + +/** + * @brief The builder to construct an instance of `reduce_fn` functor base on the given + * value of the `duplicate_keep_option` member variable. + */ +struct reduce_func_builder { + duplicate_keep_option const keep; + + template + auto build(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + size_type* const d_output) + { + return reduce_fn{d_map, d_hasher, d_equal, keep, d_output}; + } +}; + +} // namespace + +// This function is split from `distinct.cu` to improve compile time. 
+rmm::device_uvector reduce_by_row( + hash_map_type const& map, + std::shared_ptr const preprocessed_input, + size_type num_rows, + cudf::nullate::DYNAMIC has_nulls, + bool has_nested_columns, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY, + "This function should not be called with KEEP_ANY"); + + return hash_reduce_by_row(map, + preprocessed_input, + num_rows, + has_nulls, + has_nested_columns, + nulls_equal, + nans_equal, + reduce_func_builder{keep}, + reduction_init_value(keep), + stream, + mr); +} + +} // namespace cudf::detail diff --git a/cpp/src/stream_compaction/distinct_reduce.cuh b/cpp/src/stream_compaction/distinct_helpers.hpp similarity index 92% rename from cpp/src/stream_compaction/distinct_reduce.cuh rename to cpp/src/stream_compaction/distinct_helpers.hpp index 8ec1fa18205..b667d0b04f0 100644 --- a/cpp/src/stream_compaction/distinct_reduce.cuh +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -14,18 +14,14 @@ * limitations under the License. */ -#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" -#include #include #include #include #include #include -#include - -#include namespace cudf::detail { @@ -56,6 +52,8 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) * - If `keep == KEEP_LAST`: max of row indices in the group. * - If `keep == KEEP_NONE`: count of equivalent rows (group size). * + * Note that this function is not needed when `keep == KEEP_NONE`. + * * At the beginning of the operation, the entire output array is filled with a value given by * the `reduction_init_value()` function. Then, the reduction result for each row group is written * into the output array at the index of an unspecified row in the group. 
@@ -68,11 +66,13 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) * @param has_nested_columns Indicates whether the input table has any nested columns * @param keep The parameter to determine what type of reduction to perform * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN values in floating point column should be + * considered equal. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned vector * @return A device_uvector containing the reduction results */ -rmm::device_uvector hash_reduce_by_row( +rmm::device_uvector reduce_by_row( hash_map_type const& map, std::shared_ptr const preprocessed_input, size_type num_rows, diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu deleted file mode 100644 index 020e6a495bc..00000000000 --- a/cpp/src/stream_compaction/distinct_reduce.cu +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "distinct_reduce.cuh" - -#include -#include -#include - -namespace cudf::detail { - -namespace { -/** - * @brief A functor to perform reduce-by-key with keys are rows that compared equal. 
- * - * TODO: We need to switch to use `static_reduction_map` when it is ready - * (https://github.com/NVIDIA/cuCollections/pull/98). - */ -template -struct reduce_by_row_fn { - MapView const d_map; - KeyHasher const d_hasher; - KeyEqual const d_equal; - duplicate_keep_option const keep; - size_type* const d_output; - - reduce_by_row_fn(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - duplicate_keep_option const keep, - size_type* const d_output) - : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, keep{keep}, d_output{d_output} - { - } - - __device__ void operator()(size_type const idx) const - { - auto const out_ptr = get_output_ptr(idx); - - if (keep == duplicate_keep_option::KEEP_FIRST) { - // Store the smallest index of all rows that are equal. - atomicMin(out_ptr, idx); - } else if (keep == duplicate_keep_option::KEEP_LAST) { - // Store the greatest index of all rows that are equal. - atomicMax(out_ptr, idx); - } else { - // Count the number of rows in each group of rows that are compared equal. - atomicAdd(out_ptr, size_type{1}); - } - } - - private: - __device__ size_type* get_output_ptr(size_type const idx) const - { - auto const iter = d_map.find(idx, d_hasher, d_equal); - - if (iter != d_map.end()) { - // Only one index value of the duplicate rows could be inserted into the map. - // As such, looking up for all indices of duplicate rows always returns the same value. - auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); - - // All duplicate rows will have concurrent access to this same output slot. - return &d_output[inserted_idx]; - } else { - // All input `idx` values have been inserted into the map before. - // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if - // `d_equal(idx, idx) == false`. - // Such situations are due to comparing nulls or NaNs which are considered as always unequal. - // In those cases, all rows containing nulls or NaNs are distinct. 
Just return their direct - // output slot. - return &d_output[idx]; - } - } -}; - -} // namespace - -rmm::device_uvector hash_reduce_by_row( - hash_map_type const& map, - std::shared_ptr const preprocessed_input, - size_type num_rows, - cudf::nullate::DYNAMIC has_nulls, - bool has_nested_columns, - duplicate_keep_option keep, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY, - "This function should not be called with KEEP_ANY"); - - auto reduction_results = rmm::device_uvector(num_rows, stream, mr); - - thrust::uninitialized_fill(rmm::exec_policy(stream), - reduction_results.begin(), - reduction_results.end(), - reduction_init_value(keep)); - - auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); - - auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - - auto const reduce_by_row = [&](auto const value_comp) { - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - reduce_by_row_fn{ - map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()}); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - reduce_by_row_fn{ - map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()}); - } - }; - - if (nans_equal == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - reduce_by_row(nan_equal_comparator{}); - 
} else { - using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - reduce_by_row(nan_unequal_comparator{}); - } - - return reduction_results; -} - -} // namespace cudf::detail diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh index 4779cd990fd..839672d6a56 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.cuh +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -29,28 +29,6 @@ namespace cudf { namespace detail { -namespace experimental { - -/** - * @brief Device callable to hash a given row. - */ -template -class compaction_hash { - public: - compaction_hash(RowHash row_hasher) : _hash{row_hasher} {} - - __device__ inline auto operator()(size_type i) const noexcept - { - auto hash = _hash(i); - return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash; - } - - private: - RowHash _hash; -}; - -} // namespace experimental - /**  * @brief Device functor to determine if a row is valid.  
*/ diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp index 0cd2d8f4b14..58d958d2ff4 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.hpp +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -30,11 +30,6 @@ namespace cudf { namespace detail { -constexpr auto COMPACTION_EMPTY_KEY_SENTINEL = std::numeric_limits::max(); -constexpr auto COMPACTION_EMPTY_VALUE_SENTINEL = std::numeric_limits::min(); - -using hash_type = cuco::murmurhash3_32; - using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; using hash_map_type = From 664dfc33a29ddb86e671c19f12e2b56e32d46a8b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Sep 2023 14:21:57 -1000 Subject: [PATCH 03/23] Raise NotImplementedError in to_datetime if Z (or tz component) in string (#14074) closes #14039 Avoids this discrepancy when a date string has a tz component ```python In [1]: import pandas In [2]: import cudf In [3]: data = ["2019-01-01T00:00:00.000Z"] In [4]: cudf.to_datetime(data) Out[4]: DatetimeIndex(['2019-01-01'], dtype='datetime64[ns]') In [5]: pandas.to_datetime(data) Out[5]: DatetimeIndex(['2019-01-01 00:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14074 --- python/cudf/cudf/core/column/datetime.py | 15 +++++--- python/cudf/cudf/tests/test_datetime.py | 49 +++++++++++------------- python/cudf/cudf/tests/test_string.py | 12 +++--- 3 files changed, 39 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index da6c4fb858c..7775723e267 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ 
-631,6 +631,10 @@ def infer_format(element: str, **kwargs) -> str: fmt = _guess_datetime_format(element, **kwargs) if fmt is not None: + if "%z" in fmt or "%Z" in fmt: + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) return fmt element_parts = element.split(".") @@ -651,11 +655,12 @@ def infer_format(element: str, **kwargs) -> str: raise ValueError("Unable to infer the timestamp format from the data") if len(second_parts) > 1: - # "Z" indicates Zulu time(widely used in aviation) - Which is - # UTC timezone that currently cudf only supports. Having any other - # unsupported timezone will let the code fail below - # with a ValueError. - second_parts.remove("Z") + # We may have a non-digit, timezone-like component + # like Z, UTC-3, +01:00 + if any(re.search(r"\D", part) for part in second_parts): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) second_part = "".join(second_parts[1:]) if len(second_part) > 1: diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4c20258ae67..5cab19eedc6 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1250,40 +1250,31 @@ def test_datetime_reductions(data, op, dtype): assert_eq(expected, actual) +@pytest.mark.parametrize("timezone", ["naive", "UTC"]) @pytest.mark.parametrize( "data", [ - np.datetime_as_string( - np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"), - timezone="UTC", - ), + np.arange("2002-10-27T04:30", 4 * 60, 60, 
dtype="M8[m]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"), + np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"), ], ) @pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_datetime_infer_format(data, dtype): - sr = cudf.Series(data) - psr = pd.Series(data) +def test_datetime_infer_format(data, timezone, dtype): + ts_data = np.datetime_as_string(data, timezone=timezone) + sr = cudf.Series(ts_data) + if timezone == "naive": + psr = pd.Series(ts_data) - expected = psr.astype(dtype) - actual = sr.astype(dtype) + expected = psr.astype(dtype) + actual = sr.astype(dtype) - assert_eq(expected, actual) + assert_eq(expected, actual) + else: + with pytest.raises(NotImplementedError): + sr.astype(dtype) def test_dateoffset_instance_subclass_check(): @@ -2158,6 +2149,12 @@ def test_format_timezone_not_implemented(code): ) +@pytest.mark.parametrize("tz", ["Z", "UTC-3", "+01:00"]) +def test_no_format_timezone_not_implemented(tz): + with pytest.raises(NotImplementedError): + cudf.to_datetime([f"2020-01-01 00:00:00{tz}"]) + + @pytest.mark.parametrize("arg", [True, False]) def test_args_not_datetime_typerror(arg): with pytest.raises(TypeError): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 2bddd93ccb8..d54027eb707 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -200,12 +200,12 @@ def test_string_astype(dtype): data = ["True", "False", "True", "False", "False"] elif dtype.startswith("datetime64"): data = [ - "2019-06-04T00:00:00Z", - "2019-06-04T12:12:12Z", - "2019-06-03T00:00:00Z", - "2019-05-04T00:00:00Z", - "2018-06-04T00:00:00Z", - "1922-07-21T01:02:03Z", + "2019-06-04T00:00:00", + "2019-06-04T12:12:12", + "2019-06-03T00:00:00", + "2019-05-04T00:00:00", + "2018-06-04T00:00:00", + "1922-07-21T01:02:03", ] elif dtype == "str" or dtype 
== "object": data = ["ab", "cd", "ef", "gh", "ij"] From 89557bb0efad2d32098ba86b78e4f4706e7fe88f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 13 Sep 2023 19:22:46 -0500 Subject: [PATCH 04/23] Allow `numeric_only=True` for reduction operations on numeric types (#14111) Fixes: #14090 This PR allows passing `numeric_only=True` for reduction operation on numerical columns. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14111 --- python/cudf/cudf/core/single_column_frame.py | 6 ++- python/cudf/cudf/tests/test_stats.py | 44 ++++++++++---------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7c019f0722c..6a56ab8f3a5 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -49,9 +49,11 @@ def _reduce( if level is not None: raise NotImplementedError("level parameter is not implemented yet") - if numeric_only: + if numeric_only and not isinstance( + self._column, cudf.core.column.numerical_base.NumericalBaseColumn + ): raise NotImplementedError( - f"Series.{op} does not implement numeric_only" + f"Series.{op} does not implement numeric_only." 
) try: return getattr(self._column, op)(**kwargs) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 6478fbaad95..463cdb8a7f4 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -247,30 +247,37 @@ def test_misc_quantiles(data, q): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_kurtosis_series(data, null_flag): +@pytest.mark.parametrize("numeric_only", [False, True]) +def test_kurtosis_series(data, null_flag, numeric_only): pdata = data.to_pandas() if null_flag and len(data) > 2: data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - got = data.kurtosis() + got = data.kurtosis(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurtosis() + expected = pdata.kurtosis(numeric_only=numeric_only) np.testing.assert_array_almost_equal(got, expected) - got = data.kurt() + got = data.kurt(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt() + expected = pdata.kurt(numeric_only=numeric_only) np.testing.assert_array_almost_equal(got, expected) - got = data.kurt(numeric_only=False) - got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt(numeric_only=False) - np.testing.assert_array_almost_equal(got, expected) - with pytest.raises(NotImplementedError): - data.kurt(numeric_only=True) +@pytest.mark.parametrize("op", ["skew", "kurt"]) +def test_kurt_skew_error(op): + gs = cudf.Series(["ab", "cd"]) + ps = gs.to_pandas() + + with pytest.raises(FutureWarning): + assert_exceptions_equal( + getattr(gs, op), + getattr(ps, op), + lfunc_args_and_kwargs=([], {"numeric_only": True}), + rfunc_args_and_kwargs=([], {"numeric_only": True}), + ) @pytest.mark.parametrize( @@ -290,26 +297,19 @@ def test_kurtosis_series(data, null_flag): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_skew_series(data, null_flag): +@pytest.mark.parametrize("numeric_only", 
[False, True]) +def test_skew_series(data, null_flag, numeric_only): pdata = data.to_pandas() if null_flag and len(data) > 2: data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - got = data.skew() - expected = pdata.skew() + got = data.skew(numeric_only=numeric_only) + expected = pdata.skew(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() np.testing.assert_array_almost_equal(got, expected) - got = data.skew(numeric_only=False) - expected = pdata.skew(numeric_only=False) - got = got if np.isscalar(got) else got.to_numpy() - np.testing.assert_array_almost_equal(got, expected) - - with pytest.raises(NotImplementedError): - data.skew(numeric_only=True) - @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100]) From 1bfeee7575e137bc75741cb2caf015e55ecab2cd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Sep 2023 14:23:14 -1000 Subject: [PATCH 05/23] Raise NotImplementedError for datetime strings with UTC offset (#14070) Avoids e.g. 
DatetimeIndex(["2022-07-22 00:00:00+02:00"]) from dropping the +02:00 since timezones are not supported Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14070 --- python/cudf/cudf/core/column/column.py | 18 ++++++++++++++++-- python/cudf/cudf/tests/test_datetime.py | 6 ++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 59ab3569814..d2e2f11a12e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2519,11 +2519,11 @@ def _construct_array( arbitrary = cupy.asarray(arbitrary, dtype=dtype) except (TypeError, ValueError): native_dtype = dtype - inferred_dtype = None + inferred_dtype = infer_dtype(arbitrary, skipna=False) if ( dtype is None and not cudf._lib.scalar._is_null_host_scalar(arbitrary) - and (inferred_dtype := infer_dtype(arbitrary, skipna=False)) + and inferred_dtype in ( "mixed", "mixed-integer", @@ -2533,6 +2533,20 @@ def _construct_array( if inferred_dtype == "interval": # Only way to construct an Interval column. 
return pd.array(arbitrary) + elif ( + inferred_dtype == "string" and getattr(dtype, "kind", None) == "M" + ): + # We may have date-like strings with timezones + try: + pd_arbitrary = pd.to_datetime(arbitrary) + if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) + except pd.errors.OutOfBoundsDatetime: + # https://github.com/pandas-dev/pandas/issues/55096 + pass + arbitrary = np.asarray( arbitrary, dtype=native_dtype diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 5cab19eedc6..0cc7112454c 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2141,6 +2141,12 @@ def test_daterange_pandas_compatibility(): assert_eq(expected, actual) +def test_strings_with_utc_offset_not_implemented(): + with pytest.warns(DeprecationWarning, match="parsing timezone"): # cupy + with pytest.raises(NotImplementedError): + DatetimeIndex(["2022-07-22 00:00:00+02:00"]) + + @pytest.mark.parametrize("code", ["z", "Z"]) def test_format_timezone_not_implemented(code): with pytest.raises(NotImplementedError): From 3b691f4be744ff1155df3634cd334211e738e37d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Sep 2023 10:03:52 -1000 Subject: [PATCH 06/23] Raise NotImplementedError in to_datetime with dayfirst without infer_format (#14058) Raises a `NotImplementedError` to avoid this incorrect behavior (which seems to actually not be implemented) ```python In [6]: cudf.to_datetime(["10-02-2014"], dayfirst=True) Out[6]: DatetimeIndex(['2014-10-02'], dtype='datetime64[ns]') ``` closes https://github.com/rapidsai/cudf/issues/14042 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14058 --- python/cudf/cudf/core/tools/datetimes.py | 11 +++---- 
python/cudf/cudf/tests/test_datetime.py | 38 +++++++++++++++++++----- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index f736e055163..a3f4bacf206 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -353,15 +353,16 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): format=format, ) else: - if infer_datetime_format and format is None: + if format is None: + if not infer_datetime_format and dayfirst: + raise NotImplementedError( + f"{dayfirst=} not implemented " + f"when {format=} and {infer_datetime_format=}." + ) format = column.datetime.infer_format( element=col.element_indexing(0), dayfirst=dayfirst, ) - elif format is None: - format = column.datetime.infer_format( - element=col.element_indexing(0) - ) return col.as_datetime_column( dtype=_unit_dtype_map[unit], format=format, diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 0cc7112454c..164856ed6f5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -617,22 +617,44 @@ def test_datetime_dataframe(): @pytest.mark.parametrize("infer_datetime_format", [True, False]) def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): pd_data = data + is_string_data = False if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) + is_string_data = ( + gd_data.ndim == 1 + and not gd_data.empty + and gd_data.dtype.kind == "O" + ) else: if type(pd_data).__module__ == np.__name__: gd_data = cp.array(pd_data) else: gd_data = pd_data + is_string_data = isinstance(gd_data, list) and isinstance( + next(iter(gd_data), None), str + ) - expected = pd.to_datetime( - pd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - actual = cudf.to_datetime( - gd_data, dayfirst=dayfirst, 
infer_datetime_format=infer_datetime_format - ) - - assert_eq(actual, expected) + if dayfirst and not infer_datetime_format and is_string_data: + # Note: pandas<2.0 also does not respect dayfirst=True correctly + # for object data + with pytest.raises(NotImplementedError): + cudf.to_datetime( + gd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + else: + expected = pd.to_datetime( + pd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + actual = cudf.to_datetime( + gd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + assert_eq(actual, expected) @pytest.mark.parametrize( From 4ca568e764a3898bf619a221cdb91a9261df22bf Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 18 Sep 2023 09:00:39 -0500 Subject: [PATCH 07/23] Update pyarrow-related dispatch logic in dask_cudf (#14069) Updates `dask_cudf` dispatch logic to avoid breakage from https://github.com/dask/dask/pull/10500. Also removes stale `try`/`except` logic. 
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Ray Douglass (https://github.com/raydouglass) - gpuCI (https://github.com/GPUtester) - Mike Wendt (https://github.com/mike-wendt) - AJ Schmidt (https://github.com/ajschmidt8) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14069 --- python/dask_cudf/dask_cudf/backends.py | 69 +++++++++---------- .../dask_cudf/tests/test_dispatch.py | 21 ++++-- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 2470b4d50f1..e3f4f04eb85 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -20,11 +20,14 @@ from dask.dataframe.dispatch import ( categorical_dtype_dispatch, concat_dispatch, + from_pyarrow_table_dispatch, group_split_dispatch, grouper_dispatch, hash_object_dispatch, is_categorical_dtype_dispatch, make_meta_dispatch, + pyarrow_schema_dispatch, + to_pyarrow_table_dispatch, tolist_dispatch, union_categoricals_dispatch, ) @@ -317,16 +320,6 @@ def get_grouper_cudf(obj): return cudf.core.groupby.Grouper -try: - from dask.dataframe.dispatch import pyarrow_schema_dispatch - - @pyarrow_schema_dispatch.register((cudf.DataFrame,)) - def get_pyarrow_schema_cudf(obj): - return obj.to_arrow().schema - -except ImportError: - pass - try: try: from dask.array.dispatch import percentile_lookup @@ -378,35 +371,37 @@ def percentile_cudf(a, q, interpolation="linear"): except ImportError: pass -try: - # Requires dask>2023.6.0 - from dask.dataframe.dispatch import ( - from_pyarrow_table_dispatch, - to_pyarrow_table_dispatch, - ) - @to_pyarrow_table_dispatch.register(cudf.DataFrame) - def _cudf_to_table(obj, preserve_index=True, **kwargs): - if kwargs: - warnings.warn( - "Ignoring the following arguments to " - f"`to_pyarrow_table_dispatch`: {list(kwargs)}" - ) - return 
obj.to_arrow(preserve_index=preserve_index) - - @from_pyarrow_table_dispatch.register(cudf.DataFrame) - def _table_to_cudf(obj, table, self_destruct=None, **kwargs): - # cudf ignores self_destruct. - kwargs.pop("self_destruct", None) - if kwargs: - warnings.warn( - f"Ignoring the following arguments to " - f"`from_pyarrow_table_dispatch`: {list(kwargs)}" - ) - return obj.from_arrow(table) +@pyarrow_schema_dispatch.register((cudf.DataFrame,)) +def _get_pyarrow_schema_cudf(obj, preserve_index=True, **kwargs): + if kwargs: + warnings.warn( + "Ignoring the following arguments to " + f"`pyarrow_schema_dispatch`: {list(kwargs)}" + ) + return meta_nonempty(obj).to_arrow(preserve_index=preserve_index).schema -except ImportError: - pass + +@to_pyarrow_table_dispatch.register(cudf.DataFrame) +def _cudf_to_table(obj, preserve_index=True, **kwargs): + if kwargs: + warnings.warn( + "Ignoring the following arguments to " + f"`to_pyarrow_table_dispatch`: {list(kwargs)}" + ) + return obj.to_arrow(preserve_index=preserve_index) + + +@from_pyarrow_table_dispatch.register(cudf.DataFrame) +def _table_to_cudf(obj, table, self_destruct=None, **kwargs): + # cudf ignores self_destruct. 
+ kwargs.pop("self_destruct", None) + if kwargs: + warnings.warn( + f"Ignoring the following arguments to " + f"`from_pyarrow_table_dispatch`: {list(kwargs)}" + ) + return obj.from_arrow(table) @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index 22cc0f161e2..cf49b1df4f4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -3,9 +3,7 @@ import numpy as np import pandas as pd import pytest -from packaging import version -import dask from dask.base import tokenize from dask.dataframe import assert_eq from dask.dataframe.methods import is_categorical_dtype @@ -24,10 +22,6 @@ def test_is_categorical_dispatch(): assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category")) -@pytest.mark.skipif( - version.parse(dask.__version__) <= version.parse("2023.6.0"), - reason="Pyarrow-conversion dispatch requires dask>2023.6.0", -) def test_pyarrow_conversion_dispatch(): from dask.dataframe.dispatch import ( from_pyarrow_table_dispatch, @@ -79,3 +73,18 @@ def test_deterministic_tokenize(index): df2 = df.set_index(["B", "C"], drop=False) assert tokenize(df) != tokenize(df2) assert tokenize(df2) == tokenize(df2) + + +@pytest.mark.parametrize("preserve_index", [True, False]) +def test_pyarrow_schema_dispatch(preserve_index): + from dask.dataframe.dispatch import ( + pyarrow_schema_dispatch, + to_pyarrow_table_dispatch, + ) + + df = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc")) + df["d"] = cudf.Series(["cat", "dog"] * 5) + table = to_pyarrow_table_dispatch(df, preserve_index=preserve_index) + schema = pyarrow_schema_dispatch(df, preserve_index=preserve_index) + + assert schema.equals(table.schema) From 5935ef3ce26b1eb7136dcaa989a36b15071a9d0d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 18 Sep 2023 09:53:18 -0500 Subject: [PATCH 08/23] Drop `kwargs` 
from `Series.count` (#14106) Fixes: #14089 This PR drops `kwargs` from `Series.count` method signature. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/14106 --- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_series.py | 6 ++++++ python/dask_cudf/dask_cudf/core.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f44a3123dd3..7692d3015f8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2549,7 +2549,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): # Stats # @_cudf_nvtx_annotate - def count(self, level=None, **kwargs): + def count(self, level=None): """ Return number of non-NA/null observations in the Series diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 798809b0ada..b1e991106ee 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2311,3 +2311,9 @@ def test_series_round_builtin(data, digits): actual = round(gs, digits) assert_eq(expected, actual) + + +def test_series_count_invalid_param(): + s = cudf.Series([]) + with pytest.raises(TypeError): + s.count(skipna=True) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index d2858876fcd..5b37e6e825c 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -421,7 +421,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: - n = x.count(skipna=skipna) + n = x.count() avg = x.mean(skipna=skipna) else: # Not skipping nulls, so might as well From 8e081c015417c5a8d2a99f9db6bbc9a2c438e477 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR 
Date: Mon, 18 Sep 2023 12:51:08 -0500 Subject: [PATCH 09/23] Add support for nested dict in `DataFrame` constructor (#14119) Fixes: #14096 This PR enables nested dict initialization support in `DataFrame` constructor. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14119 --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/tests/test_dataframe.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5a3d25a08a7..4fc175512a0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -977,7 +977,7 @@ def _align_input_series_indices(data, index): input_series = [ Series(val) for val in data.values() - if isinstance(val, (pd.Series, Series)) + if isinstance(val, (pd.Series, Series, dict)) ] if input_series: @@ -994,7 +994,7 @@ def _align_input_series_indices(data, index): index = aligned_input_series[0].index for name, val in data.items(): - if isinstance(val, (pd.Series, Series)): + if isinstance(val, (pd.Series, Series, dict)): data[name] = aligned_input_series.pop(0) return data, index diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 61372bab3ad..652bdbbee45 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10349,3 +10349,22 @@ def test_dataframe_round_builtin(digits): actual = round(gdf, digits) assert_eq(expected, actual) + + +def test_dataframe_init_from_nested_dict(): + ordered_dict = OrderedDict( + [ + ("one", OrderedDict([("col_a", "foo1"), ("col_b", "bar1")])), + ("two", OrderedDict([("col_a", "foo2"), ("col_b", "bar2")])), + ("three", OrderedDict([("col_a", "foo3"), ("col_b", "bar3")])), + ] + ) + pdf = pd.DataFrame(ordered_dict) + gdf = cudf.DataFrame(ordered_dict) + + 
assert_eq(pdf, gdf) + regular_dict = {key: dict(value) for key, value in ordered_dict.items()} + + pdf = pd.DataFrame(regular_dict) + gdf = cudf.DataFrame(regular_dict) + assert_eq(pdf, gdf) From 4467066c952111c0131383784d3eb6bf3248f0ac Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 18 Sep 2023 12:51:53 -0500 Subject: [PATCH 10/23] Restrict iterables of `DataFrame`'s as input to `DataFrame` constructor (#14118) Fixes: #14094 This PR raises an error when an iterable of `DataFrame`s is detected in the `DataFrame` constructor. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14118 --- python/cudf/cudf/core/dataframe.py | 11 ++++++----- python/cudf/cudf/tests/test_dataframe.py | 6 ++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4fc175512a0..84c16b71997 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -852,12 +852,13 @@ def _init_from_list_like(self, data, index=None, columns=None): elif len(data) > 0 and isinstance(data[0], pd._libs.interval.Interval): data = DataFrame.from_pandas(pd.DataFrame(data)) self._data = data._data + elif any( + not isinstance(col, (abc.Iterable, abc.Sequence)) for col in data + ): + raise TypeError("Inputs should be an iterable or sequence.") + elif len(data) > 0 and not can_convert_to_column(data[0]): + raise ValueError("Must pass 2-d input.") else: - if any( - not isinstance(col, (abc.Iterable, abc.Sequence)) - for col in data - ): - raise TypeError("Inputs should be an iterable or sequence.") if ( len(data) > 0 and columns is None diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 652bdbbee45..cbef9bfa2d8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10260,6 +10260,12 
@@ def __getitem__(self, key): cudf.DataFrame({"a": A()}) +def test_dataframe_constructor_dataframe_list(): + df = cudf.DataFrame(range(2)) + with pytest.raises(ValueError): + cudf.DataFrame([df]) + + def test_dataframe_constructor_from_namedtuple(): Point1 = namedtuple("Point1", ["a", "b", "c"]) Point2 = namedtuple("Point1", ["x", "y"]) From 2acd3dfa9e859feb4d803d9446c89b80f10bd54a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 18 Sep 2023 14:10:14 -0700 Subject: [PATCH 11/23] Expand statistics support in ORC writer (#13848) Closes #7087, closes #13793, closes #13899 This PR adds support for several cases and statistics types: - sum statistics are included even when all elements are null (no minmax); - sum statistics are included in double stats; - minimum/maximum and minimumNanos/maximumNanos are included in timestamp stats; - hasNull field is written for all columns. - decimal statistics Added tests for all supported stats. Authors: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Robert (Bobby) Evans (https://github.com/revans2) - Vyas Ramasubramani (https://github.com/vyasr) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/13848 --- cpp/include/cudf/io/orc_metadata.hpp | 10 +- .../detail/convert/fixed_point_to_string.cuh | 80 +++++++++ cpp/src/io/orc/orc.cpp | 4 +- cpp/src/io/orc/stats_enc.cu | 169 +++++++++++++----- cpp/src/io/parquet/page_enc.cu | 4 +- .../statistics_type_identification.cuh | 19 +- .../io/statistics/typed_statistics_chunk.cuh | 2 +- .../strings/convert/convert_fixed_point.cu | 54 +----- cpp/tests/io/orc_test.cpp | 109 +++++++++-- python/cudf/cudf/tests/test_orc.py | 60 ++++--- 10 files changed, 356 insertions(+), 155 deletions(-) create mode 100644 cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh diff --git a/cpp/include/cudf/io/orc_metadata.hpp 
b/cpp/include/cudf/io/orc_metadata.hpp index 623ee2e49fc..82d59803c25 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -111,10 +111,10 @@ struct string_statistics : minmax_statistics, sum_statistics count; ///< Count of `false` and `true` values + std::vector count; ///< count of `true` values }; /** @@ -141,8 +141,10 @@ using binary_statistics = sum_statistics; * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. */ struct timestamp_statistics : minmax_statistics { - std::optional minimum_utc; ///< minimum in milliseconds - std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_utc; ///< minimum in milliseconds + std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_nanos; ///< nanoseconds part of the minimum + std::optional maximum_nanos; ///< nanoseconds part of the maximum }; namespace orc { diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh new file mode 100644 index 00000000000..0ee26ec9ee2 --- /dev/null +++ b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace cudf::strings::detail { + +/** + * @brief Returns the number of digits in the given fixed point number. + * + * @param value The value of the fixed point number + * @param scale The scale of the fixed point number + * @return int32_t The number of digits required to represent the fixed point number + */ +__device__ inline int32_t fixed_point_string_size(__int128_t const& value, int32_t scale) +{ + if (scale >= 0) return count_digits(value) + scale; + + auto const abs_value = numeric::detail::abs(value); + auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); + auto const fraction = count_digits(abs_value % exp_ten); + auto const num_zeros = std::max(0, (-scale - fraction)); + return static_cast(value < 0) + // sign if negative + count_digits(abs_value / exp_ten) + // integer + 1 + // decimal point + num_zeros + // zeros padding + fraction; // size of fraction +} + +/** + * @brief Converts the given fixed point number to a string. + * + * Caller is responsible for ensuring that the output buffer is large enough. The required output + * buffer size can be obtained by calling `fixed_point_string_size`. 
+ * + * @param value The value of the fixed point number + * @param scale The scale of the fixed point number + * @param out_ptr The pointer to the output string + */ +__device__ inline void fixed_point_to_string(__int128_t const& value, int32_t scale, char* out_ptr) +{ + if (scale >= 0) { + out_ptr += integer_to_string(value, out_ptr); + thrust::generate_n(thrust::seq, out_ptr, scale, []() { return '0'; }); // add zeros + return; + } + + // scale < 0 + // write format: [-]integer.fraction + // where integer = abs(value) / (10^abs(scale)) + // fraction = abs(value) % (10^abs(scale)) + if (value < 0) *out_ptr++ = '-'; // add sign + auto const abs_value = numeric::detail::abs(value); + auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); + auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); + + out_ptr += integer_to_string(abs_value / exp_ten, out_ptr); // add the integer part + *out_ptr++ = '.'; // add decimal point + + thrust::generate_n(thrust::seq, out_ptr, num_zeros, []() { return '0'; }); // add zeros + out_ptr += num_zeros; + + integer_to_string(abs_value % exp_ten, out_ptr); // add the fraction part +} + +} // namespace cudf::strings::detail diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index fc50b7118be..bc399b75ef9 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -178,7 +178,9 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) auto op = std::tuple(field_reader(1, s.minimum), field_reader(2, s.maximum), field_reader(3, s.minimum_utc), - field_reader(4, s.maximum_utc)); + field_reader(4, s.maximum_utc), + field_reader(5, s.minimum_nanos), + field_reader(6, s.maximum_nanos)); function_builder(s, maxlen, op); } diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 069841980c1..69d7ec95acd 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -16,15 +16,16 @@ #include "orc_gpu.hpp" -#include #include +#include +#include + 
#include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::gpu { + +using strings::detail::fixed_point_string_size; constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; @@ -58,13 +59,14 @@ __global__ void __launch_bounds__(init_threads_per_block) constexpr unsigned int buffersize_reduction_dim = 32; constexpr unsigned int block_size = buffersize_reduction_dim * buffersize_reduction_dim; constexpr unsigned int pb_fld_hdrlen = 1; -constexpr unsigned int pb_fld_hdrlen16 = 2; // > 127-byte length -constexpr unsigned int pb_fld_hdrlen32 = 5; // > 16KB length +constexpr unsigned int pb_fld_hdrlen32 = 5; +constexpr unsigned int pb_fldlen_int32 = 5; constexpr unsigned int pb_fldlen_int64 = 10; constexpr unsigned int pb_fldlen_float64 = 8; -constexpr unsigned int pb_fldlen_decimal = 40; // Assume decimal2string fits in 40 characters constexpr unsigned int pb_fldlen_bucket1 = 1 + pb_fldlen_int64; -constexpr unsigned int pb_fldlen_common = 2 * pb_fld_hdrlen + pb_fldlen_int64; +// statistics field number + number of values + has null +constexpr unsigned int pb_fldlen_common = + pb_fld_hdrlen + (pb_fld_hdrlen + pb_fldlen_int64) + 2 * pb_fld_hdrlen; template __global__ void __launch_bounds__(block_size, 1) @@ -87,21 +89,32 @@ __global__ void __launch_bounds__(block_size, 1) case dtype_int8: case dtype_int16: case dtype_int32: - case dtype_date32: case dtype_int64: - case dtype_timestamp64: stats_len = pb_fldlen_common + pb_fld_hdrlen + 3 * (pb_fld_hdrlen + pb_fldlen_int64); break; + case dtype_date32: + stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64); + break; + case dtype_timestamp64: + stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) + + 2 * (pb_fld_hdrlen + pb_fldlen_int32); + break; case dtype_float32: case dtype_float64: stats_len = pb_fldlen_common + pb_fld_hdrlen + 3 * (pb_fld_hdrlen + pb_fldlen_float64); 
break; case dtype_decimal64: - case dtype_decimal128: - stats_len = pb_fldlen_common + pb_fld_hdrlen16 + 3 * (pb_fld_hdrlen + pb_fldlen_decimal); - break; + case dtype_decimal128: { + auto const scale = groups[idx].col_dtype.scale(); + auto const min_size = fixed_point_string_size(chunks[idx].min_value.d128_val, scale); + auto const max_size = fixed_point_string_size(chunks[idx].max_value.d128_val, scale); + auto const sum_size = fixed_point_string_size(chunks[idx].sum.d128_val, scale); + // common + total field length + encoded string lengths + strings + stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen32) + + min_size + max_size + sum_size; + } break; case dtype_string: - stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fldlen_int64) + + stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen32) + chunks[idx].min_value.str_val.length + chunks[idx].max_value.str_val.length; break; case dtype_none: stats_len = pb_fldlen_common; @@ -126,9 +139,6 @@ struct stats_state_s { statistics_chunk chunk; statistics_merge_group group; statistics_dtype stats_dtype; //!< Statistics data type for this column - // ORC stats - uint64_t numberOfValues; - uint8_t hasNull; }; /* @@ -178,6 +188,15 @@ __device__ inline uint8_t* pb_put_binary(uint8_t* p, uint32_t id, void const* by return p + len; } +__device__ inline uint8_t* pb_put_decimal( + uint8_t* p, uint32_t id, __int128_t value, int32_t scale, int32_t len) +{ + p[0] = id * 8 + ProtofType::FIXEDLEN; + p = pb_encode_uint(p + 1, len); + strings::detail::fixed_point_to_string(value, scale, reinterpret_cast(p)); + return p + len; +} + // Protobuf field encoding for 64-bit raw encoding (double) __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* raw64) { @@ -186,6 +205,15 @@ __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* r return p + 9; } +// Splits a nanosecond timestamp into milliseconds and 
nanoseconds +__device__ std::pair split_nanosecond_timestamp(int64_t nano_count) +{ + auto const ns = cuda::std::chrono::nanoseconds(nano_count); + auto const ms_floor = cuda::std::chrono::floor(ns); + auto const ns_remainder = ns - ms_floor; + return {ms_floor.count(), ns_remainder.count()}; +} + /** * @brief Encode statistics in ORC protobuf format * @@ -228,12 +256,14 @@ __global__ void __launch_bounds__(encode_threads_per_block) // Encode and update actual bfr size if (idx < statistics_count && t == 0) { - s->chunk = chunks[idx]; - s->group = groups[idx]; - s->stats_dtype = s->group.stats_dtype; - s->base = blob_bfr + s->group.start_chunk; - s->end = blob_bfr + s->group.start_chunk + s->group.num_chunks; - uint8_t* cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); + s->chunk = chunks[idx]; + s->group = groups[idx]; + s->stats_dtype = s->group.stats_dtype; + s->base = blob_bfr + s->group.start_chunk; + s->end = blob_bfr + s->group.start_chunk + s->group.num_chunks; + uint8_t* cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); + cur = pb_put_uint(cur, 10, s->chunk.null_count != 0); // hasNull (bool) + uint8_t* fld_start = cur; switch (s->stats_dtype) { case dtype_int8: @@ -265,11 +295,14 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional double maximum = 2; // optional double sum = 3; // } - if (s->chunk.has_minmax) { + if (s->chunk.has_minmax || s->chunk.has_sum) { *cur = 3 * 8 + ProtofType::FIXEDLEN; cur += 2; - cur = pb_put_fixed64(cur, 1, &s->chunk.min_value.fp_val); - cur = pb_put_fixed64(cur, 2, &s->chunk.max_value.fp_val); + if (s->chunk.has_minmax) { + cur = pb_put_fixed64(cur, 1, &s->chunk.min_value.fp_val); + cur = pb_put_fixed64(cur, 2, &s->chunk.max_value.fp_val); + } + if (s->chunk.has_sum) { cur = pb_put_fixed64(cur, 3, &s->chunk.sum.fp_val); } fld_start[1] = cur - (fld_start + 2); } break; @@ -280,18 +313,25 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional string maximum = 2; // optional sint64 sum 
= 3; // sum will store the total length of all strings // } - if (s->chunk.has_minmax && s->chunk.has_sum) { - uint32_t sz = (pb_put_int(cur, 3, s->chunk.sum.i_val) - cur) + - (pb_put_uint(cur, 1, s->chunk.min_value.str_val.length) - cur) + - (pb_put_uint(cur, 2, s->chunk.max_value.str_val.length) - cur) + - s->chunk.min_value.str_val.length + s->chunk.max_value.str_val.length; + if (s->chunk.has_minmax || s->chunk.has_sum) { + uint32_t sz = 0; + if (s->chunk.has_minmax) { + sz += (pb_put_uint(cur, 1, s->chunk.min_value.str_val.length) - cur) + + (pb_put_uint(cur, 2, s->chunk.max_value.str_val.length) - cur) + + s->chunk.min_value.str_val.length + s->chunk.max_value.str_val.length; + } + if (s->chunk.has_sum) { sz += pb_put_int(cur, 3, s->chunk.sum.i_val) - cur; } + cur[0] = 4 * 8 + ProtofType::FIXEDLEN; cur = pb_encode_uint(cur + 1, sz); - cur = pb_put_binary( - cur, 1, s->chunk.min_value.str_val.ptr, s->chunk.min_value.str_val.length); - cur = pb_put_binary( - cur, 2, s->chunk.max_value.str_val.ptr, s->chunk.max_value.str_val.length); - cur = pb_put_int(cur, 3, s->chunk.sum.i_val); + + if (s->chunk.has_minmax) { + cur = pb_put_binary( + cur, 1, s->chunk.min_value.str_val.ptr, s->chunk.min_value.str_val.length); + cur = pb_put_binary( + cur, 2, s->chunk.max_value.str_val.ptr, s->chunk.max_value.str_val.length); + } + if (s->chunk.has_sum) { cur = pb_put_int(cur, 3, s->chunk.sum.i_val); } } break; case dtype_bool: @@ -299,8 +339,9 @@ __global__ void __launch_bounds__(encode_threads_per_block) // message BucketStatistics { // repeated uint64 count = 1 [packed=true]; // } - if (s->chunk.has_sum) { // Sum is equal to the number of 'true' values - cur[0] = 5 * 8 + ProtofType::FIXEDLEN; + if (s->chunk.has_sum) { + cur[0] = 5 * 8 + ProtofType::FIXEDLEN; + // count is equal to the number of 'true' values, despite what specs say cur = pb_put_packed_uint(cur + 2, 1, s->chunk.sum.u_val); fld_start[1] = cur - (fld_start + 2); } @@ -313,8 +354,33 @@ __global__ void 
__launch_bounds__(encode_threads_per_block) // optional string maximum = 2; // optional string sum = 3; // } - if (s->chunk.has_minmax) { - // TODO: Decimal support (decimal min/max stored as strings) + if (s->chunk.has_minmax or s->chunk.has_sum) { + auto const scale = s->group.col_dtype.scale(); + + uint32_t sz = 0; + auto const min_size = + s->chunk.has_minmax ? fixed_point_string_size(s->chunk.min_value.d128_val, scale) : 0; + auto const max_size = + s->chunk.has_minmax ? fixed_point_string_size(s->chunk.max_value.d128_val, scale) : 0; + if (s->chunk.has_minmax) { + // encoded string lengths, plus the strings + sz += (pb_put_uint(cur, 1, min_size) - cur) + min_size + + (pb_put_uint(cur, 1, max_size) - cur) + max_size; + } + auto const sum_size = + s->chunk.has_sum ? fixed_point_string_size(s->chunk.sum.d128_val, scale) : 0; + if (s->chunk.has_sum) { sz += (pb_put_uint(cur, 1, sum_size) - cur) + sum_size; } + + cur[0] = 6 * 8 + ProtofType::FIXEDLEN; + cur = pb_encode_uint(cur + 1, sz); + + if (s->chunk.has_minmax) { + cur = pb_put_decimal(cur, 1, s->chunk.min_value.d128_val, scale, min_size); // minimum + cur = pb_put_decimal(cur, 2, s->chunk.max_value.d128_val, scale, max_size); // maximum + } + if (s->chunk.has_sum) { + cur = pb_put_decimal(cur, 3, s->chunk.sum.d128_val, scale, sum_size); // sum + } } break; case dtype_date32: @@ -338,12 +404,24 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional sint64 maximum = 2; // optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch // optional sint64 maximumUtc = 4; + // optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond + // precision optional int32 maximumNanos = 6; // } if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; cur += 2; - cur = pb_put_int(cur, 3, s->chunk.min_value.i_val); // minimumUtc - cur = pb_put_int(cur, 4, s->chunk.max_value.i_val); // maximumUtc + auto const [min_ms, min_ns_remainder] = + 
split_nanosecond_timestamp(s->chunk.min_value.i_val); + auto const [max_ms, max_ns_remainder] = + split_nanosecond_timestamp(s->chunk.max_value.i_val); + + // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC + cur = pb_put_int(cur, 1, min_ms); // minimum + cur = pb_put_int(cur, 2, max_ms); // maximum + cur = pb_put_int(cur, 3, min_ms); // minimumUtc + cur = pb_put_int(cur, 4, max_ms); // maximumUtc + cur = pb_put_int(cur, 5, min_ns_remainder); // minimumNanos + cur = pb_put_int(cur, 6, max_ns_remainder); // maximumNanos fld_start[1] = cur - (fld_start + 2); } break; @@ -403,7 +481,4 @@ void orc_encode_statistics(uint8_t* blob_bfr, blob_bfr, groups, chunks, statistics_count); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::gpu diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 0af561be8da..fe0dbb85124 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1858,8 +1858,8 @@ __device__ std::pair get_extremum(statistics_val const* s } case dtype_int64: case dtype_timestamp64: - case dtype_float64: - case dtype_decimal64: return {stats_val, sizeof(int64_t)}; + case dtype_float64: return {stats_val, sizeof(int64_t)}; + case dtype_decimal64: case dtype_decimal128: byte_reverse128(stats_val->d128_val, scratch); return {scratch, sizeof(__int128_t)}; diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh index 32931d7d34d..ea8c71f0dcb 100644 --- a/cpp/src/io/statistics/statistics_type_identification.cuh +++ b/cpp/src/io/statistics/statistics_type_identification.cuh @@ -49,15 +49,15 @@ enum class is_int96_timestamp { YES, NO }; template struct conversion_map; -// Every timestamp or duration type is converted to milliseconds in ORC statistics +// Every timestamp or duration type is converted to nanoseconds in ORC statistics template struct 
conversion_map { - using types = std::tuple, - std::pair, - std::pair, - std::pair, - std::pair, - std::pair>; + using types = std::tuple, + std::pair, + std::pair, + std::pair, + std::pair, + std::pair>; }; // In Parquet timestamps and durations with second resolution are converted to @@ -125,7 +125,7 @@ class extrema_type { using non_arithmetic_extrema_type = typename std::conditional_t< cudf::is_fixed_point() or cudf::is_duration() or cudf::is_timestamp(), - typename std::conditional_t, __int128_t, int64_t>, + typename std::conditional_t(), __int128_t, int64_t>, typename std::conditional_t< std::is_same_v, string_view, @@ -134,8 +134,7 @@ class extrema_type { // unsigned int/bool -> uint64_t // signed int -> int64_t // float/double -> double - // decimal32/64 -> int64_t - // decimal128 -> __int128_t + // decimal32/64/128 -> __int128_t // duration_[T] -> int64_t // string_view -> string_view // byte_array_view -> byte_array_view diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index d007209a12a..e6ec1471cb7 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -244,9 +244,9 @@ get_untyped_chunk(typed_statistics_chunk const& chunk) stat.null_count = chunk.null_count; stat.has_minmax = chunk.has_minmax; stat.has_sum = [&]() { - if (!chunk.has_minmax) return false; // invalidate the sum if overflow or underflow is possible if constexpr (std::is_floating_point_v or std::is_integral_v) { + if (!chunk.has_minmax) { return true; } return std::numeric_limits::max() / chunk.non_nulls >= static_cast(chunk.maximum_value) and std::numeric_limits::lowest() / chunk.non_nulls <= diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index a3336258d3e..51aab9faeba 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -21,7 +21,7 @@ 
#include #include #include -#include +#include #include #include #include @@ -200,62 +200,19 @@ struct from_fixed_point_fn { size_type* d_offsets{}; char* d_chars{}; - /** - * @brief Calculates the size of the string required to convert the element, in base-10 format. - * - * Output format is [-]integer.fraction - */ - __device__ int32_t compute_output_size(DecimalType value) - { - auto const scale = d_decimals.type().scale(); - - if (scale >= 0) return count_digits(value) + scale; - - auto const abs_value = numeric::detail::abs(value); - auto const exp_ten = numeric::detail::exp10(-scale); - auto const fraction = count_digits(abs_value % exp_ten); - auto const num_zeros = std::max(0, (-scale - fraction)); - return static_cast(value < 0) + // sign if negative - count_digits(abs_value / exp_ten) + // integer - 1 + // decimal point - num_zeros + // zeros padding - fraction; // size of fraction - } - /** * @brief Converts a decimal element into a string. * * The value is converted into base-10 digits [0-9] * plus the decimal point and a negative sign prefix. 
*/ - __device__ void decimal_to_string(size_type idx) + __device__ void fixed_point_element_to_string(size_type idx) { auto const value = d_decimals.element(idx); auto const scale = d_decimals.type().scale(); char* d_buffer = d_chars + d_offsets[idx]; - if (scale >= 0) { - d_buffer += integer_to_string(value, d_buffer); - thrust::generate_n(thrust::seq, d_buffer, scale, []() { return '0'; }); // add zeros - return; - } - - // scale < 0 - // write format: [-]integer.fraction - // where integer = abs(value) / (10^abs(scale)) - // fraction = abs(value) % (10^abs(scale)) - if (value < 0) *d_buffer++ = '-'; // add sign - auto const abs_value = numeric::detail::abs(value); - auto const exp_ten = numeric::detail::exp10(-scale); - auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); - - d_buffer += integer_to_string(abs_value / exp_ten, d_buffer); // add the integer part - *d_buffer++ = '.'; // add decimal point - - thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; }); // add zeros - d_buffer += num_zeros; - - integer_to_string(abs_value % exp_ten, d_buffer); // add the fraction part + fixed_point_to_string(value, scale, d_buffer); } __device__ void operator()(size_type idx) @@ -265,9 +222,10 @@ struct from_fixed_point_fn { return; } if (d_chars != nullptr) { - decimal_to_string(idx); + fixed_point_element_to_string(idx); } else { - d_offsets[idx] = compute_output_size(d_decimals.element(idx)); + d_offsets[idx] = + fixed_point_string_size(d_decimals.element(idx), d_decimals.type().scale()); } } }; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cff7b1cf081..890ef914713 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -976,6 +976,10 @@ TEST_F(OrcReaderTest, CombinedSkipRowTest) TEST_F(OrcStatisticsTest, Basic) { auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto ts_sequence = + cudf::detail::make_counting_transform_iterator(0, 
[](auto i) { return (i - 4) * 1000002; }); + auto dec_sequence = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i * 1001; }); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); std::vector strings{ @@ -986,11 +990,17 @@ TEST_F(OrcStatisticsTest, Basic) sequence, sequence + num_rows, validity); column_wrapper col2( sequence, sequence + num_rows, validity); - column_wrapper col3{strings.begin(), strings.end()}; - column_wrapper col4(sequence, sequence + num_rows); - column_wrapper col5( - sequence, sequence + num_rows, validity); - table_view expected({col1, col2, col3, col4, col5}); + str_col col3{strings.begin(), strings.end()}; + column_wrapper col4( + ts_sequence, ts_sequence + num_rows, validity); + column_wrapper col5( + ts_sequence, ts_sequence + num_rows, validity); + bool_col col6({true, true, true, true, true, false, false, false, false}, validity); + + cudf::test::fixed_point_column_wrapper col7( + dec_sequence, dec_sequence + num_rows, numeric::scale_type{-1}); + + table_view expected({col1, col2, col3, col4, col5, col6, col7}); auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); @@ -1000,16 +1010,21 @@ TEST_F(OrcStatisticsTest, Basic) auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); - auto const expected_column_names = - std::vector{"", "_col0", "_col1", "_col2", "_col3", "_col4"}; + auto expected_column_names = std::vector{""}; + std::generate_n( + std::back_inserter(expected_column_names), + expected.num_columns(), + [starting_index = 0]() mutable { return "_col" + std::to_string(starting_index++); }); EXPECT_EQ(stats.column_names, expected_column_names); auto validate_statistics = [&](std::vector const& stats) { + ASSERT_EQ(stats.size(), expected.num_columns() + 1); auto& s0 = stats[0]; EXPECT_EQ(*s0.number_of_values, 9ul); auto& s1 = stats[1]; EXPECT_EQ(*s1.number_of_values, 4ul); + EXPECT_TRUE(*s1.has_null); auto& ts1 = 
std::get(s1.type_specific_stats); EXPECT_EQ(*ts1.minimum, 1); EXPECT_EQ(*ts1.maximum, 7); @@ -1017,30 +1032,55 @@ TEST_F(OrcStatisticsTest, Basic) auto& s2 = stats[2]; EXPECT_EQ(*s2.number_of_values, 4ul); + EXPECT_TRUE(*s2.has_null); auto& ts2 = std::get(s2.type_specific_stats); EXPECT_EQ(*ts2.minimum, 1.); EXPECT_EQ(*ts2.maximum, 7.); - // No sum ATM, filed #7087 - ASSERT_FALSE(ts2.sum); + EXPECT_EQ(*ts2.sum, 16.); auto& s3 = stats[3]; EXPECT_EQ(*s3.number_of_values, 9ul); + EXPECT_FALSE(*s3.has_null); auto& ts3 = std::get(s3.type_specific_stats); EXPECT_EQ(*ts3.minimum, "Friday"); EXPECT_EQ(*ts3.maximum, "Wednesday"); EXPECT_EQ(*ts3.sum, 58ul); auto& s4 = stats[4]; - EXPECT_EQ(*s4.number_of_values, 9ul); - EXPECT_EQ(std::get(s4.type_specific_stats).count[0], 8ul); + EXPECT_EQ(*s4.number_of_values, 4ul); + EXPECT_TRUE(*s4.has_null); + auto& ts4 = std::get(s4.type_specific_stats); + EXPECT_EQ(*ts4.minimum, -4); + EXPECT_EQ(*ts4.maximum, 3); + EXPECT_EQ(*ts4.minimum_utc, -4); + EXPECT_EQ(*ts4.maximum_utc, 3); + EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_EQ(*ts4.maximum_nanos, 6); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); + EXPECT_TRUE(*s5.has_null); auto& ts5 = std::get(s5.type_specific_stats); - EXPECT_EQ(*ts5.minimum_utc, 1000); - EXPECT_EQ(*ts5.maximum_utc, 7000); - ASSERT_FALSE(ts5.minimum); - ASSERT_FALSE(ts5.maximum); + EXPECT_EQ(*ts5.minimum, -3001); + EXPECT_EQ(*ts5.maximum, 3000); + EXPECT_EQ(*ts5.minimum_utc, -3001); + EXPECT_EQ(*ts5.maximum_utc, 3000); + EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_EQ(*ts5.maximum_nanos, 6000); + + auto& s6 = stats[6]; + EXPECT_EQ(*s6.number_of_values, 4ul); + EXPECT_TRUE(*s6.has_null); + auto& ts6 = std::get(s6.type_specific_stats); + EXPECT_EQ(ts6.count[0], 2); + + auto& s7 = stats[7]; + EXPECT_EQ(*s7.number_of_values, 9ul); + EXPECT_FALSE(*s7.has_null); + auto& ts7 = std::get(s7.type_specific_stats); + EXPECT_EQ(*ts7.minimum, "0.0"); + EXPECT_EQ(*ts7.maximum, "800.8"); + EXPECT_EQ(*ts7.sum, 
"3603.6"); }; validate_statistics(stats.file_stats); @@ -1259,9 +1299,8 @@ TEST_F(OrcStatisticsTest, Overflow) TEST_F(OrcStatisticsTest, HasNull) { - // cudf's ORC writer doesn't yet support the ability to encode the hasNull value in statistics so - // we're embedding a file created using pyorc - // + // This test can now be implemented with libcudf; keeping the pyorc version to keep the test + // inputs diversified // Method to create file: // >>> import pyorc // >>> output = open("./temp.orc", "wb") @@ -1861,4 +1900,38 @@ TEST_F(OrcWriterTest, EmptyChildStringColumn) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +template +void check_all_null_stats(cudf::io::column_statistics const& stats) +{ + EXPECT_EQ(stats.number_of_values, 0); + EXPECT_TRUE(stats.has_null); + + auto const ts = std::get(stats.type_specific_stats); + EXPECT_FALSE(ts.minimum.has_value()); + EXPECT_FALSE(ts.maximum.has_value()); + EXPECT_TRUE(ts.sum.has_value()); + EXPECT_EQ(*ts.sum, 0); +} + +TEST_F(OrcStatisticsTest, AllNulls) +{ + float64_col double_col({0., 0., 0.}, cudf::test::iterators::all_nulls()); + int32_col int_col({0, 0, 0}, cudf::test::iterators::all_nulls()); + str_col string_col({"", "", ""}, cudf::test::iterators::all_nulls()); + + cudf::table_view expected({int_col, double_col, string_col}); + + std::vector out_buffer; + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected); + cudf::io::write_orc(out_opts); + + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}); + + check_all_null_stats(stats.file_stats[1]); + check_all_null_stats(stats.file_stats[2]); + check_all_null_stats(stats.file_stats[3]); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index aafc8831bf4..07aa5430f4f 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ 
b/python/cudf/cudf/tests/test_orc.py @@ -633,16 +633,19 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): for col in gdf: if "minimum" in file_stats[0][col]: stats_min = file_stats[0][col]["minimum"] - actual_min = gdf[col].min() - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = gdf[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in file_stats[0][col]: stats_max = file_stats[0][col]["maximum"] - actual_max = gdf[col].max() - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = gdf[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in file_stats[0][col]: stats_num_vals = file_stats[0][col]["number_of_values"] - actual_num_vals = gdf[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = gdf[col].count() + assert stats_num_vals == actual_num_vals # compare stripe statistics with actual min/max for stripe_idx in range(0, orc_file.nstripes): @@ -651,21 +654,24 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): stripe_df = cudf.DataFrame(stripe.to_pandas()) for col in stripe_df: if "minimum" in stripes_stats[stripe_idx][col]: - actual_min = stripe_df[col].min() stats_min = stripes_stats[stripe_idx][col]["minimum"] - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = stripe_df[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in stripes_stats[stripe_idx][col]: - actual_max = stripe_df[col].max() stats_max = stripes_stats[stripe_idx][col]["maximum"] - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = stripe_df[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in stripes_stats[stripe_idx][col]: stats_num_vals = stripes_stats[stripe_idx][col][ "number_of_values" ] - actual_num_vals = stripe_df[col].count() - 
assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = stripe_df[col].count() + assert stats_num_vals == actual_num_vals @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @@ -733,16 +739,19 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): for col in expect: if "minimum" in file_stats[0][col]: stats_min = file_stats[0][col]["minimum"] - actual_min = expect[col].min() - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = expect[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in file_stats[0][col]: stats_max = file_stats[0][col]["maximum"] - actual_max = expect[col].max() - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = expect[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in file_stats[0][col]: stats_num_vals = file_stats[0][col]["number_of_values"] - actual_num_vals = expect[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = expect[col].count() + assert stats_num_vals == actual_num_vals # compare stripe statistics with actual min/max for stripe_idx in range(0, orc_file.nstripes): @@ -751,21 +760,24 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): stripe_df = cudf.DataFrame(stripe.to_pandas()) for col in stripe_df: if "minimum" in stripes_stats[stripe_idx][col]: - actual_min = stripe_df[col].min() stats_min = stripes_stats[stripe_idx][col]["minimum"] - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = stripe_df[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in stripes_stats[stripe_idx][col]: - actual_max = stripe_df[col].max() stats_max = stripes_stats[stripe_idx][col]["maximum"] - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = stripe_df[col].max() + 
assert normalized_equals(actual_max, stats_max) if "number_of_values" in stripes_stats[stripe_idx][col]: stats_num_vals = stripes_stats[stripe_idx][col][ "number_of_values" ] - actual_num_vals = stripe_df[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = stripe_df[col].count() + assert stats_num_vals == actual_num_vals @pytest.mark.parametrize("nrows", [1, 100, 6000000]) From bdc1f3a6e1f383cd689ba8e92903b89e49cdb8d8 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 18 Sep 2023 19:34:29 -0400 Subject: [PATCH 12/23] Expose streams in public strings case APIs (#14056) Add stream parameter to public strings APIs: - `cudf::strings::capitalize()` - `cudf::strings::title()` - `cudf::strings::is_title()` - `cudf::strings::to_lower()` - `cudf::strings::to_upper()` - `cudf::strings::swapcase()` Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14056 --- cpp/include/cudf/strings/capitalize.hpp | 28 ++++++++----- cpp/include/cudf/strings/case.hpp | 8 +++- cpp/src/strings/capitalize.cu | 9 ++-- cpp/src/strings/case.cu | 9 ++-- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/strings/case_test.cpp | 55 +++++++++++++++++++++++++ 6 files changed, 92 insertions(+), 18 deletions(-) create mode 100644 cpp/tests/streams/strings/case_test.cpp diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp index 6d01ab047ba..57375e9ac6a 100644 --- a/cpp/include/cudf/strings/capitalize.hpp +++ b/cpp/include/cudf/strings/capitalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -50,16 +50,18 @@ namespace strings { * * Any null string entries return corresponding null output column entries. * - * @throw cudf::logic_error if `delimiter.is_valid()` is `false`. + * @throw cudf::logic_error if `delimiter.is_valid()` is `false`. * - * @param input String column. - * @param delimiters Characters for identifying words to capitalize. + * @param input String column + * @param delimiters Characters for identifying words to capitalize + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return Column of strings capitalized from the input column. + * @return Column of strings capitalized from the input column */ std::unique_ptr capitalize( strings_column_view const& input, - string_scalar const& delimiters = string_scalar(""), + string_scalar const& delimiters = string_scalar("", true, cudf::get_default_stream()), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,14 +85,16 @@ std::unique_ptr capitalize( * * Any null string entries return corresponding null output column entries. * - * @param input String column. - * @param sequence_type The character type that is used when identifying words. + * @param input String column + * @param sequence_type The character type that is used when identifying words + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return Column of titled strings. 
+ * @return Column of titled strings */ std::unique_ptr title( strings_column_view const& input, string_character_types sequence_type = string_character_types::ALPHA, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -112,12 +116,14 @@ std::unique_ptr title( * * Any null string entries result in corresponding null output column entries. * - * @param input String column. + * @param input String column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return Column of type BOOL8. + * @return Column of type BOOL8 */ std::unique_ptr is_title( strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/case.hpp b/cpp/include/cudf/strings/case.hpp index 06ba4f8d882..94191686a92 100644 --- a/cpp/include/cudf/strings/case.hpp +++ b/cpp/include/cudf/strings/case.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,11 +38,13 @@ namespace strings { * Any null entries create null entries in the output column. * * @param strings Strings instance for this operation. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of strings with characters converted. 
*/ std::unique_ptr to_lower( strings_column_view const& strings, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -55,11 +57,13 @@ std::unique_ptr to_lower( * Any null entries create null entries in the output column. * * @param strings Strings instance for this operation. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of strings with characters converted. */ std::unique_ptr to_upper( strings_column_view const& strings, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -73,11 +77,13 @@ std::unique_ptr to_upper( * Any null entries create null entries in the output column. * * @param strings Strings instance for this operation. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of strings with characters converted. 
*/ std::unique_ptr swapcase( strings_column_view const& strings, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index 4e248922702..c555031b588 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -287,25 +287,28 @@ std::unique_ptr is_title(strings_column_view const& input, std::unique_ptr capitalize(strings_column_view const& input, string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::capitalize(input, delimiter, cudf::get_default_stream(), mr); + return detail::capitalize(input, delimiter, stream, mr); } std::unique_ptr title(strings_column_view const& input, string_character_types sequence_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::title(input, sequence_type, cudf::get_default_stream(), mr); + return detail::title(input, sequence_type, stream, mr); } std::unique_ptr is_title(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_title(input, cudf::get_default_stream(), mr); + return detail::is_title(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index c5fe7a19f53..8f4c2ee574a 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -310,24 +310,27 @@ std::unique_ptr swapcase(strings_column_view const& strings, // APIs std::unique_ptr to_lower(strings_column_view const& strings, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_lower(strings, cudf::get_default_stream(), mr); + return detail::to_lower(strings, stream, mr); } std::unique_ptr 
to_upper(strings_column_view const& strings, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_upper(strings, cudf::get_default_stream(), mr); + return detail::to_upper(strings, stream, mr); } std::unique_ptr swapcase(strings_column_view const& strings, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::swapcase(strings, cudf::get_default_stream(), mr); + return detail::swapcase(strings, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a69dc9bf2f8..4923ef5c903 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -627,6 +627,7 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_STRINGS_TEST streams/strings/case_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/strings/case_test.cpp b/cpp/tests/streams/strings/case_test.cpp new file mode 100644 index 00000000000..df3eabd773a --- /dev/null +++ b/cpp/tests/streams/strings/case_test.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +class StringsCaseTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsCaseTest, LowerUpper) +{ + auto const input = + cudf::test::strings_column_wrapper({"", + "The quick brown fox", + "jumps over the lazy dog.", + "all work and no play makes Jack a dull boy", + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::to_lower(view, cudf::test::get_default_stream()); + cudf::strings::to_upper(view, cudf::test::get_default_stream()); + cudf::strings::swapcase(view, cudf::test::get_default_stream()); +} + +TEST_F(StringsCaseTest, Capitalize) +{ + auto const input = + cudf::test::strings_column_wrapper({"", + "The Quick Brown Fox", + "jumps over the lazy dog", + "all work and no play makes Jack a dull boy"}); + auto view = cudf::strings_column_view(input); + + auto const delimiter = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + cudf::strings::capitalize(view, delimiter, cudf::test::get_default_stream()); + cudf::strings::is_title(view, cudf::test::get_default_stream()); + cudf::strings::title( + view, cudf::strings::string_character_types::ALPHA, cudf::test::get_default_stream()); +} From c016b58b24e63468e9110a6ca82adfc5fd61202d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Sep 2023 07:50:20 -0500 Subject: [PATCH 13/23] Update to clang 16.0.6. (#14120) This PR updates cudf to use clang 16.0.6. The previous version 16.0.1 has some minor formatting issues affecting several RAPIDS repos. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14120 --- .pre-commit-config.yaml | 2 +- cpp/benchmarks/iterator/iterator.cu | 2 +- .../stream_compaction/apply_boolean_mask.cpp | 4 +- cpp/benchmarks/string/char_types.cpp | 2 +- cpp/benchmarks/string/extract.cpp | 2 +- .../cudf/column/column_device_view.cuh | 2 +- cpp/include/cudf/detail/copy_if.cuh | 2 +- cpp/include/cudf/detail/indexalator.cuh | 4 +- cpp/include/cudf/detail/join.hpp | 4 +- cpp/include/cudf/fixed_point/fixed_point.hpp | 2 +- cpp/include/cudf/groupby.hpp | 4 +- cpp/include/cudf/io/csv.hpp | 2 +- cpp/include/cudf/io/json.hpp | 2 +- cpp/include/cudf/strings/detail/utf8.hpp | 36 ++-- cpp/include/cudf/table/row_operators.cuh | 4 +- cpp/include/cudf/table/table_view.hpp | 2 +- cpp/include/cudf/wrappers/dictionary.hpp | 2 +- cpp/include/cudf_test/base_fixture.hpp | 4 +- cpp/include/nvtext/subword_tokenize.hpp | 2 +- cpp/scripts/run-clang-tidy.py | 2 +- cpp/src/copying/contiguous_split.cu | 8 +- cpp/src/groupby/sort/functors.hpp | 10 +- cpp/src/io/avro/avro_gpu.cu | 2 +- cpp/src/io/comp/cpu_unbz2.cpp | 2 +- cpp/src/io/comp/debrotli.cu | 4 +- cpp/src/io/comp/gpuinflate.cu | 18 +- cpp/src/io/comp/uncomp.cpp | 10 +- cpp/src/io/comp/unsnap.cu | 2 +- cpp/src/io/json/json_column.cu | 2 +- cpp/src/io/json/nested_json_gpu.cu | 160 +++++++++--------- cpp/src/io/orc/orc_gpu.hpp | 2 +- cpp/src/io/orc/stripe_data.cu | 4 +- .../io/parquet/compact_protocol_reader.cpp | 2 +- .../io/parquet/compact_protocol_writer.cpp | 2 +- cpp/src/io/parquet/delta_binary.cuh | 20 +-- cpp/src/io/parquet/page_delta_decode.cu | 2 +- cpp/src/io/parquet/parquet.hpp | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 22 +-- cpp/src/io/parquet/reader_impl_preprocess.cu | 2 +- cpp/src/join/join.cu | 4 +- .../quantiles/tdigest/tdigest_aggregation.cu | 2 +- .../rolling/detail/rolling_collect_list.cuh | 2 +- 
cpp/src/strings/char_types/char_types.cu | 4 +- cpp/src/strings/convert/convert_datetime.cu | 6 +- cpp/src/strings/convert/convert_durations.cu | 2 +- cpp/src/strings/convert/convert_floats.cu | 6 +- cpp/src/strings/convert/convert_integers.cu | 2 +- cpp/src/strings/convert/convert_ipv4.cu | 2 +- cpp/src/strings/convert/convert_urls.cu | 4 +- cpp/src/strings/json/json_path.cu | 2 +- cpp/src/strings/regex/regcomp.cpp | 14 +- cpp/src/strings/regex/regcomp.h | 8 +- cpp/src/strings/regex/regex.cuh | 18 +- cpp/src/strings/regex/regex.inl | 10 +- cpp/src/strings/replace/replace_re.cu | 2 +- cpp/src/strings/split/partition.cu | 2 +- cpp/src/strings/split/split.cuh | 2 +- cpp/src/strings/split/split_re.cu | 2 +- cpp/src/strings/utilities.cu | 6 +- cpp/src/text/normalize.cu | 4 +- cpp/src/text/replace.cu | 2 +- cpp/src/text/subword/bpe_tokenizer.cu | 2 +- cpp/src/text/subword/load_merges_file.cu | 2 +- cpp/src/text/utilities/tokenize_ops.cuh | 2 +- cpp/tests/groupby/merge_lists_tests.cpp | 2 +- cpp/tests/groupby/merge_sets_tests.cpp | 12 +- cpp/tests/io/parquet_test.cpp | 6 +- cpp/tests/lists/reverse_tests.cpp | 8 +- .../difference_distinct_tests.cpp | 2 +- .../intersect_distinct_tests.cpp | 4 +- .../set_operations/union_distinct_tests.cpp | 4 +- .../stream_compaction/distinct_tests.cpp | 10 +- .../reshape/interleave_columns_tests.cpp | 2 +- .../rolling/range_rolling_window_test.cpp | 2 +- cpp/tests/sort/segmented_sort_tests.cpp | 2 +- cpp/tests/strings/chars_types_tests.cpp | 12 +- cpp/tests/strings/durations_tests.cpp | 8 +- cpp/tests/utilities/column_utilities.cu | 2 +- 78 files changed, 276 insertions(+), 276 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 238e5b44030..7e44091774f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -63,7 +63,7 @@ repos: # Explicitly specify the pyproject.toml at the repo root, not per-project. 
args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.1 + rev: v16.0.6 hooks: - id: clang-format types_or: [c, c++, cuda] diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index 7acf24c30a5..dcd13cf62c4 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -145,7 +145,7 @@ void BM_iterator(benchmark::State& state) cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { if (raw_or_iterator) { - raw_stream_bench_cub(hasnull_F, dev_result); // driven by raw pointer + raw_stream_bench_cub(hasnull_F, dev_result); // driven by raw pointer } else { iterator_bench_cub(hasnull_F, dev_result); // driven by riterator without nulls } diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index a6feaf04842..f78aa9fa654 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -59,8 +59,8 @@ void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) int64_t const column_bytes_in = column_bytes_out; // we only read unmasked inputs int64_t const bytes_read = - (column_bytes_in + validity_bytes_in) * num_columns + // reading columns - mask_size; // reading boolean mask + (column_bytes_in + validity_bytes_in) * num_columns + // reading columns + mask_size; // reading boolean mask int64_t const bytes_written = (column_bytes_out + validity_bytes_out) * num_columns; // writing columns diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index 8e9e595fcef..59e6245fd41 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -43,7 +43,7 @@ static void bench_char_types(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // 
gather some throughput statistics as well auto chars_size = input.chars_size(); - state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_reads(chars_size); // all bytes are read; if (api_type == "all") { state.add_global_memory_writes(num_rows); // output is a bool8 per row } else { diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index 9e67c5a5b52..135dadabbe4 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -43,7 +43,7 @@ static void bench_extract(nvbench::state& state) std::uniform_int_distribution words_dist(0, 999); std::vector samples(100); // 100 unique rows of data to reuse std::generate(samples.begin(), samples.end(), [&]() { - std::string row; // build a row of random tokens + std::string row; // build a row of random tokens while (static_cast(row.size()) < row_width) { row += std::to_string(words_dist(generator)) + " "; } diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 05ef21bd750..35851a99822 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1393,7 +1393,7 @@ struct pair_accessor { */ template struct pair_rep_accessor { - column_device_view const col; ///< column view of column in device + column_device_view const col; ///< column view of column in device using rep_type = device_storage_type_t; ///< representation type diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 1dd91dcd865..ebe7e052b6d 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -133,7 +133,7 @@ __launch_bounds__(block_size) __global__ if (has_validity) { temp_valids[threadIdx.x] = false; // init shared memory if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false; - __syncthreads(); // wait for init + __syncthreads(); // wait for 
init } if (mask_true) { diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 0ab9da0dbd0..4731c4919e3 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -248,7 +248,7 @@ struct input_indexalator : base_indexalator { friend struct indexalator_factory; friend struct base_indexalator; // for CRTP - using reference = size_type const; // this keeps STL and thrust happy + using reference = size_type const; // this keeps STL and thrust happy input_indexalator() = default; input_indexalator(input_indexalator const&) = default; @@ -332,7 +332,7 @@ struct output_indexalator : base_indexalator { friend struct indexalator_factory; friend struct base_indexalator; // for CRTP - using reference = output_indexalator const&; // required for output iterators + using reference = output_indexalator const&; // required for output iterators output_indexalator() = default; output_indexalator(output_indexalator const&) = default; diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index 6fcf10aef57..b69632c83ca 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -78,8 +78,8 @@ struct hash_join { cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal cudf::table_view _build; ///< input table to build the hash map std::shared_ptr - _preprocessed_build; ///< input table preprocssed for row operators - map_type _hash_table; ///< hash table built on `_build` + _preprocessed_build; ///< input table preprocssed for row operators + map_type _hash_table; ///< hash table built on `_build` public: /** diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 7c59c2f9194..13d8716c1df 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -829,5 +829,5 @@ using decimal32 = fixed_point; ///< 32-bit decima 
using decimal64 = fixed_point; ///< 64-bit decimal fixed point using decimal128 = fixed_point<__int128_t, Radix::BASE_10>; ///< 128-bit decimal fixed point -/** @} */ // end of group +/** @} */ // end of group } // namespace numeric diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 6e575685daa..1c31e8777a8 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -386,8 +386,8 @@ class groupby { ///< indicates null order ///< of each column std::unique_ptr - _helper; ///< Helper object - ///< used by sort based implementation + _helper; ///< Helper object + ///< used by sort based implementation /** * @brief Get the sort helper object diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index c84ca7e6c73..b49a13a8ea9 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -213,7 +213,7 @@ class csv_reader_options { auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; - auto const base_padding = 1024; // 1KB + auto const base_padding = 1024; // 1KB if (num_columns == 0) { // Use flat size if the number of columns is not known diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 15dc2a614ad..d408d249a7f 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -207,7 +207,7 @@ class json_reader_options { auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; - auto const base_padding = 1024; // 1KB + auto const base_padding = 1024; // 1KB if (num_columns == 0) { // Use flat size if the number of columns is not known diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index df8e2885782..e04572535de 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -155,18 +155,18 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str) constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 
utf8_char) { uint32_t unchr = 0; - if (utf8_char < 0x0000'0080) // single-byte pass thru + if (utf8_char < 0x0000'0080) // single-byte pass thru unchr = utf8_char; - else if (utf8_char < 0x0000'E000) // two bytes + else if (utf8_char < 0x0000'E000) // two bytes { - unchr = (utf8_char & 0x1F00) >> 2; // shift and - unchr |= (utf8_char & 0x003F); // unmask - } else if (utf8_char < 0x00F0'0000) // three bytes + unchr = (utf8_char & 0x1F00) >> 2; // shift and + unchr |= (utf8_char & 0x003F); // unmask + } else if (utf8_char < 0x00F0'0000) // three bytes { - unchr = (utf8_char & 0x0F'0000) >> 4; // get upper 4 bits - unchr |= (utf8_char & 0x00'3F00) >> 2; // shift and - unchr |= (utf8_char & 0x00'003F); // unmask - } else if (utf8_char <= 0xF800'0000u) // four bytes + unchr = (utf8_char & 0x0F'0000) >> 4; // get upper 4 bits + unchr |= (utf8_char & 0x00'3F00) >> 2; // shift and + unchr |= (utf8_char & 0x00'003F); // unmask + } else if (utf8_char <= 0xF800'0000u) // four bytes { unchr = (utf8_char & 0x0300'0000) >> 6; // upper 3 bits unchr |= (utf8_char & 0x003F'0000) >> 4; // next 6 bits @@ -185,20 +185,20 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) { cudf::char_utf8 utf8 = 0; - if (unchr < 0x0000'0080) // single byte utf8 + if (unchr < 0x0000'0080) // single byte utf8 utf8 = unchr; - else if (unchr < 0x0000'0800) // double byte utf8 + else if (unchr < 0x0000'0800) // double byte utf8 { - utf8 = (unchr << 2) & 0x1F00; // shift bits for - utf8 |= (unchr & 0x3F); // utf8 encoding + utf8 = (unchr << 2) & 0x1F00; // shift bits for + utf8 |= (unchr & 0x3F); // utf8 encoding utf8 |= 0x0000'C080; - } else if (unchr < 0x0001'0000) // triple byte utf8 + } else if (unchr < 0x0001'0000) // triple byte utf8 { - utf8 = (unchr << 4) & 0x0F'0000; // upper 4 bits - utf8 |= (unchr << 2) & 0x00'3F00; // next 6 bits - utf8 |= (unchr & 0x3F); // last 6 bits + utf8 = (unchr << 4) & 0x0F'0000; // upper 4 bits 
+ utf8 |= (unchr << 2) & 0x00'3F00; // next 6 bits + utf8 |= (unchr & 0x3F); // last 6 bits utf8 |= 0x00E0'8080; - } else if (unchr < 0x0011'0000) // quadruple byte utf8 + } else if (unchr < 0x0011'0000) // quadruple byte utf8 { utf8 = (unchr << 6) & 0x0700'0000; // upper 3 bits utf8 |= (unchr << 4) & 0x003F'0000; // next 6 bits diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index 599a85c8a54..4806f96c934 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -105,9 +105,9 @@ inline __device__ auto null_compare(bool lhs_is_null, bool rhs_is_null, null_ord { if (lhs_is_null and rhs_is_null) { // null (dictionary_wrapper const& lhs, using dictionary32 = dictionary_wrapper; ///< 32-bit integer indexed dictionary wrapper -/** @} */ // end of group +/** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index b622d7c6b78..06aabbe4e9c 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -331,9 +331,9 @@ inline auto parse_cudf_test_opts(int argc, char** argv) cxxopts::Options options(argv[0], " - cuDF tests command line options"); char const* env_rmm_mode = std::getenv("GTEST_CUDF_RMM_MODE"); // Overridden by CLI options char const* env_stream_mode = - std::getenv("GTEST_CUDF_STREAM_MODE"); // Overridden by CLI options + std::getenv("GTEST_CUDF_STREAM_MODE"); // Overridden by CLI options char const* env_stream_error_mode = - std::getenv("GTEST_CUDF_STREAM_ERROR_MODE"); // Overridden by CLI options + std::getenv("GTEST_CUDF_STREAM_ERROR_MODE"); // Overridden by CLI options auto default_rmm_mode = env_rmm_mode ? env_rmm_mode : "pool"; auto default_stream_mode = env_stream_mode ? env_stream_mode : "default"; auto default_stream_error_mode = env_stream_error_mode ? 
env_stream_error_mode : "error"; diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index ac75f5e9147..72a899d70b4 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -44,7 +44,7 @@ struct hashed_vocabulary { std::unique_ptr bin_offsets; ///< uint16 column, containing the start index of each ///< bin in the flattened hash table std::unique_ptr - cp_metadata; ///< uint32 column, The code point metadata table to use for normalization + cp_metadata; ///< uint32 column, The code point metadata table to use for normalization std::unique_ptr aux_cp_table; ///< uint64 column, The auxiliary code point table to use for normalization }; diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py index a617a4c0df7..e5e57dbf562 100644 --- a/cpp/scripts/run-clang-tidy.py +++ b/cpp/scripts/run-clang-tidy.py @@ -22,7 +22,7 @@ import shutil -EXPECTED_VERSION = "16.0.1" +EXPECTED_VERSION = "16.0.6" VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") SPACES = re.compile(r"\s+") diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index e1a55ec5419..5ea56a05dcb 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -114,8 +114,8 @@ struct dst_buf_info { int bit_shift; // # of bits to shift right by (for validity buffers) size_type valid_count; // validity count for this block of work - int src_buf_index; // source buffer index - int dst_buf_index; // destination buffer index + int src_buf_index; // source buffer index + int dst_buf_index; // destination buffer index }; /** @@ -1384,7 +1384,7 @@ struct chunk_iteration_state { std::size_t starting_batch; ///< Starting batch index for the current iteration std::vector const h_num_buffs_per_iteration; ///< The count of batches per iteration std::vector const - h_size_of_buffs_per_iteration; ///< The size in bytes 
per iteration + h_size_of_buffs_per_iteration; ///< The size in bytes per iteration }; std::unique_ptr chunk_iteration_state::create( @@ -1989,7 +1989,7 @@ struct contiguous_split_state { // This can be 1 if `contiguous_split` is just packing and not splitting std::size_t const num_partitions; ///< The number of partitions to produce - size_type const num_src_bufs; ///< Number of source buffers including children + size_type const num_src_bufs; ///< Number of source buffers including children std::size_t const num_bufs; ///< Number of source buffers including children * number of splits diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index c378ac99727..be36956b929 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -94,12 +94,12 @@ struct store_result_functor { }; protected: - sort::sort_groupby_helper& helper; ///< Sort helper - cudf::detail::result_cache& cache; ///< cache of results to store into - column_view const& values; ///< Column of values to group and aggregate + sort::sort_groupby_helper& helper; ///< Sort helper + cudf::detail::result_cache& cache; ///< cache of results to store into + column_view const& values; ///< Column of values to group and aggregate - rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels - rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results + rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels + rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results sorted keys_are_sorted; ///< Whether the keys are sorted std::unique_ptr sorted_values; ///< Memoised grouped and sorted values diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 2c634d9b590..365f6d6875c 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -303,7 +303,7 @@ avro_decode_row(schemadesc_s const* schema, // If within an array, check if we 
reached the last item if (array_repeat_count != 0 && array_children <= 0 && cur < end) { if (!--array_repeat_count) { - i = array_start; // Restart at the array parent + i = array_start; // Restart at the array parent } else { i = array_start + 1; // Restart after the array parent array_children = schema[array_start].count; diff --git a/cpp/src/io/comp/cpu_unbz2.cpp b/cpp/src/io/comp/cpu_unbz2.cpp index 7159ff30d7c..a116335b254 100644 --- a/cpp/src/io/comp/cpu_unbz2.cpp +++ b/cpp/src/io/comp/cpu_unbz2.cpp @@ -216,7 +216,7 @@ int32_t bz2_decompress_block(unbz_state_s* s) s->currBlockNo++; - skipbits(s, 32); // block CRC + skipbits(s, 32); // block CRC if (getbits(s, 1)) return BZ_DATA_ERROR; // blockRandomized not supported (old bzip versions) diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 542ca031b7c..8bafd054bdb 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -121,7 +121,7 @@ __inline__ __device__ int brotli_context(int p1, int p2, int lut) struct huff_scratch_s { uint16_t code_length_histo[16]; uint8_t code_length_code_lengths[brotli_code_length_codes]; - int8_t offset[6]; // offsets in sorted table for each length + int8_t offset[6]; // offsets in sorted table for each length uint16_t lenvlctab[32]; uint16_t sorted[brotli_code_length_codes]; // symbols sorted by code length int16_t next_symbol[32]; @@ -1298,7 +1298,7 @@ static __device__ void InverseMoveToFrontTransform(debrotli_state_s* s, uint8_t* // Reinitialize elements that could have been changed. uint32_t i = 1; uint32_t upper_bound = s->mtf_upper_bound; - uint32_t* mtf = &s->mtf[1]; // Make mtf[-1] addressable. + uint32_t* mtf = &s->mtf[1]; // Make mtf[-1] addressable. 
auto* mtf_u8 = reinterpret_cast(mtf); uint32_t pattern = 0x0302'0100; // Little-endian diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 42c4fbe7bea..8993815e560 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -124,11 +124,11 @@ struct inflate_state_s { uint8_t* outbase; ///< start of output buffer uint8_t* outend; ///< end of output buffer // Input state - uint8_t const* cur; ///< input buffer - uint8_t const* end; ///< end of input buffer + uint8_t const* cur; ///< input buffer + uint8_t const* end; ///< end of input buffer - uint2 bitbuf; ///< bit buffer (64-bit) - uint32_t bitpos; ///< position in bit buffer + uint2 bitbuf; ///< bit buffer (64-bit) + uint32_t bitpos; ///< position in bit buffer int32_t err; ///< Error status int btype; ///< current block type @@ -295,7 +295,7 @@ __device__ int construct( return 0; // complete, but decode() will fail // check for an over-subscribed or incomplete set of lengths - left = 1; // one possible code of zero length + left = 1; // one possible code of zero length for (len = 1; len <= max_bits; len++) { left <<= 1; // one more bit, double codes left left -= counts[len]; // deduct count from possible codes @@ -349,8 +349,8 @@ __device__ int init_dynamic(inflate_state_s* s) index = 0; while (index < nlen + ndist) { int symbol = decode(s, s->lencnt, s->lensym); - if (symbol < 0) return symbol; // invalid symbol - if (symbol < 16) // length in 0..15 + if (symbol < 0) return symbol; // invalid symbol + if (symbol < 16) // length in 0..15 lengths[index++] = symbol; else { // repeat instruction int len = 0; // last length to repeat, assume repeating zeros @@ -358,9 +358,9 @@ __device__ int init_dynamic(inflate_state_s* s) if (index == 0) return -5; // no last length! 
len = lengths[index - 1]; // last length symbol = 3 + getbits(s, 2); - } else if (symbol == 17) // repeat zero 3..10 times + } else if (symbol == 17) // repeat zero 3..10 times symbol = 3 + getbits(s, 3); - else // == 18, repeat zero 11..138 times + else // == 18, repeat zero 11..138 times symbol = 11 + getbits(s, 7); if (index + symbol > nlen + ndist) return -6; // too many lengths! while (symbol--) // repeat last or zero symbol times diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 017fd8abb47..0d2d21333bb 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -28,7 +28,7 @@ #include // memset -#include // uncompress +#include // uncompress using cudf::host_span; @@ -47,7 +47,7 @@ struct gz_file_header_s { uint8_t os; // OS id }; -struct zip_eocd_s // end of central directory +struct zip_eocd_s // end of central directory { uint32_t sig; // 0x0605'4b50 uint16_t disk_id; // number of this disk @@ -59,7 +59,7 @@ struct zip_eocd_s // end of central directory // number uint16_t comment_len; // comment length (excluded from struct) }; -struct zip64_eocdl // end of central dir locator +struct zip64_eocdl // end of central dir locator { uint32_t sig; // 0x0706'4b50 uint32_t disk_start; // number of the disk with the start of the zip64 end of central directory @@ -67,7 +67,7 @@ struct zip64_eocdl // end of central dir locator uint32_t num_disks; // total number of disks }; -struct zip_cdfh_s // central directory file header +struct zip_cdfh_s // central directory file header { uint32_t sig; // 0x0201'4b50 uint16_t ver; // version made by @@ -111,7 +111,7 @@ struct bz2_file_header_s { struct gz_archive_s { gz_file_header_s const* fhdr; - uint16_t hcrc16; // header crc16 if present + uint16_t hcrc16; // header crc16 if present uint16_t xlen; uint8_t const* fxtra; // xlen bytes (optional) uint8_t const* fname; // zero-terminated original filename if present diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu 
index a7a1cfd3f9e..c699502317f 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -45,7 +45,7 @@ void __device__ busy_wait(size_t cycles) struct unsnap_batch_s { int32_t len; // 1..64 = Number of bytes uint32_t - offset; // copy distance if greater than zero or negative of literal offset in byte stream + offset; // copy distance if greater than zero or negative of literal offset in byte stream }; /** diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index bdad16bd9f1..cabf904f020 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -169,7 +169,7 @@ reduce_to_column_tree(tree_meta_t& tree, }); // 4. unique_copy parent_node_ids, ranges - rmm::device_uvector column_levels(0, stream); // not required + rmm::device_uvector column_levels(0, stream); // not required rmm::device_uvector parent_col_ids(num_columns, stream); rmm::device_uvector col_range_begin(num_columns, stream); // Field names rmm::device_uvector col_range_end(num_columns, stream); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b691eaa8caf..0b49f97597d 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -762,18 +762,18 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({}), // LINE_BREAK {ValueBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BOA)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + 
{ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ {StructBegin}, // OPENING_BRACE {ListBegin}, // OPENING_BRACKET @@ -799,18 +799,18 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_LON)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}), // LINE_BREAK + {}, // OTHER /*LIST*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -824,17 +824,17 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({ValueEnd}), // LINE_BREAK {}, // OTHER /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ValueEnd, StructMemberEnd, StructEnd}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd, StructMemberEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}}}; // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ValueEnd, StructMemberEnd, StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd, StructMemberEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}), // LINE_BREAK + {}}}; // OTHER 
pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ {}, // OPENING_BRACE @@ -974,17 +974,17 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({ErrorBegin}), // LINE_BREAK {ErrorBegin}, // OTHER /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {StructEnd}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {StructMemberBegin, FieldNameBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}}}; // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StructMemberBegin, FieldNameBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ {ErrorBegin}, // OPENING_BRACE @@ -1011,17 +1011,17 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({ErrorBegin}), // LINE_BREAK {ErrorBegin}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {FieldNameEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ {ErrorBegin}, // OPENING_BRACE @@ -1048,17 +1048,17 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({ErrorBegin}), // LINE_BREAK {ErrorBegin}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // 
COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ {ErrorBegin}, // OPENING_BRACE @@ -1097,18 +1097,18 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({}), // LINE_BREAK {ErrorBegin}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}, // OTHER /*LIST*/ {}, // OPENING_BRACE {}, // OPENING_BRACKET diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 681cc0fb9d2..9b8df50a22a 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -157,7 +157,7 @@ struct EncChunk { uint8_t dtype_len; // data type length int32_t scale; // scale for decimals or timestamps - uint32_t* dict_index; // dictionary index from row index + uint32_t* dict_index; // dictionary index from row index uint32_t* decimal_offsets; orc_column_device_view const* column; }; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index b66ca827119..3edcd3d83b2 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -367,14 +367,14 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) if (zbit) { return 5 + (zbit >> 3); // up to 9x7 bits } else if 
((sizeof(T) <= 8) || (bytestream_readbyte(bs, pos + 9) <= 0x7f)) { - return 10; // up to 70 bits + return 10; // up to 70 bits } else { uint64_t next64 = bytestream_readu64(bs, pos + 10); zbit = __ffsll((~next64) & 0x8080'8080'8080'8080ull); if (zbit) { return 10 + (zbit >> 3); // Up to 18x7 bits (126) } else { - return 19; // Up to 19x7 bits (133) + return 19; // Up to 19x7 bits (133) } } } diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 92fcd151925..ae11af92f78 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -168,7 +168,7 @@ bool CompactProtocolReader::read(LogicalType* l) ParquetFieldUnion(2, l->isset.MAP, l->MAP), ParquetFieldUnion(3, l->isset.LIST, l->LIST), ParquetFieldUnion(4, l->isset.ENUM, l->ENUM), - ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL), // read the struct + ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL), // read the struct ParquetFieldUnion(6, l->isset.DATE, l->DATE), ParquetFieldUnion(7, l->isset.TIME, l->TIME), // read the struct ParquetFieldUnion(8, l->isset.TIMESTAMP, l->TIMESTAMP), // read the struct diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index b2a89129645..b2c0c97c52d 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -315,7 +315,7 @@ inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) if constexpr (not std::is_empty_v) { writer.write(val); // write the struct if it's not empty } else { - put_byte(0); // otherwise, add a stop field + put_byte(0); // otherwise, add a stop field } current_field_value = field; } diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 4fc8b9cfb8e..2382e4aafdf 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -90,16 +90,16 @@ inline 
__device__ zigzag128_t get_zz128(uint8_t const*& cur, uint8_t const* end) } struct delta_binary_decoder { - uint8_t const* block_start; // start of data, but updated as data is read - uint8_t const* block_end; // end of data - uleb128_t block_size; // usually 128, must be multiple of 128 - uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a - // multiple of 32 - uleb128_t value_count; // total values encoded in the block - zigzag128_t last_value; // last value decoded, initialized to first_value from header - - uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32 - uint32_t current_value_idx; // current value index, initialized to 0 at start of block + uint8_t const* block_start; // start of data, but updated as data is read + uint8_t const* block_end; // end of data + uleb128_t block_size; // usually 128, must be multiple of 128 + uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a + // multiple of 32 + uleb128_t value_count; // total values encoded in the block + zigzag128_t last_value; // last value decoded, initialized to first_value from header + + uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32 + uint32_t current_value_idx; // current value index, initialized to 0 at start of block zigzag128_t cur_min_delta; // min delta for the block uint32_t cur_mb; // index of the current mini-block within the block diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index e79a479388f..35f33a761be 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -85,7 +85,7 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( if (t < 2 * warp_size) { // warp0..1 target_pos = min(src_pos + 2 * batch_size, s->nz_count + batch_size); - } else { // warp2 + } else { // warp2 target_pos = min(s->nz_count, src_pos + batch_size); } __syncthreads(); diff 
--git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index a729f28d672..f7318bb9935 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -365,8 +365,8 @@ struct ColumnIndex { std::vector> min_values; // lower bound for values in each page std::vector> max_values; // upper bound for values in each page BoundaryOrder boundary_order = - BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered - std::vector null_counts; // Optional count of null values per page + BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered + std::vector null_counts; // Optional count of null values per page }; // bit space we are reserving in column_buffer::user_data diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index e82b6abc13d..a3cc37dee4f 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -299,7 +299,7 @@ struct ColumnChunkDesc { int8_t converted_type; // converted type enum LogicalType logical_type; // logical type int8_t decimal_precision; // Decimal precision - int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) + int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index; // my input column index int32_t src_col_schema; // my schema index in the file @@ -396,16 +396,16 @@ constexpr uint32_t encoding_to_mask(Encoding encoding) struct EncColumnChunk { parquet_column_device_view const* col_desc; //!< Column description size_type col_desc_id; - PageFragment* fragments; //!< First fragment in chunk - uint8_t* uncompressed_bfr; //!< Uncompressed page data - uint8_t* compressed_bfr; //!< Compressed page data - statistics_chunk const* stats; //!< Fragment statistics - uint32_t bfr_size; //!< Uncompressed buffer size - uint32_t compressed_size; //!< Compressed buffer size - uint32_t max_page_data_size; //!< Max data size (excluding 
header) of any page in this chunk - uint32_t page_headers_size; //!< Sum of size of all page headers - size_type start_row; //!< First row of chunk - uint32_t num_rows; //!< Number of rows in chunk + PageFragment* fragments; //!< First fragment in chunk + uint8_t* uncompressed_bfr; //!< Uncompressed page data + uint8_t* compressed_bfr; //!< Compressed page data + statistics_chunk const* stats; //!< Fragment statistics + uint32_t bfr_size; //!< Uncompressed buffer size + uint32_t compressed_size; //!< Compressed buffer size + uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk + uint32_t page_headers_size; //!< Sum of size of all page headers + size_type start_row; //!< First row of chunk + uint32_t num_rows; //!< Number of rows in chunk size_type num_values; //!< Number of values in chunk. Different from num_rows for nested types uint32_t first_fragment; //!< First fragment of chunk EncPage* pages; //!< Ptr to pages that belong to this chunk diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index bde73c3dd96..a2db0de26bb 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1673,7 +1673,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, // - we will be doing a chunked read gpu::ComputePageSizes(pages, chunks, - 0, // 0-max size_t. process all possible rows + 0, // 0-max size_t. process all possible rows std::numeric_limits::max(), true, // compute num_rows chunk_read_limit > 0, // compute string sizes diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 8210f3114d6..ae025b1a213 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -73,7 +73,7 @@ left_join(table_view const& left_input, // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. 
auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input, right_input}, // these should match + {left_input, right_input}, // these should match stream, rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones @@ -98,7 +98,7 @@ full_join(table_view const& left_input, // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input, right_input}, // these should match + {left_input, right_input}, // these should match stream, rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 2ce55e10fb1..9e8b75ae3b6 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -459,7 +459,7 @@ __global__ void generate_cluster_limits_kernel(int delta, int adjusted_w_index = nearest_w_index; if ((last_inserted_index < 0) || // if we haven't inserted anything yet (nearest_w_index == - last_inserted_index)) { // if we land in the same bucket as the previous cap + last_inserted_index)) { // if we land in the same bucket as the previous cap // force the value into this bucket adjusted_w_index = (last_inserted_index == group_size - 1) diff --git a/cpp/src/rolling/detail/rolling_collect_list.cuh b/cpp/src/rolling/detail/rolling_collect_list.cuh index 9f74a961e12..39d15ed716f 100644 --- a/cpp/src/rolling/detail/rolling_collect_list.cuh +++ b/cpp/src/rolling/detail/rolling_collect_list.cuh @@ -116,7 +116,7 @@ std::unique_ptr create_collect_gather_map(column_view const& child_offse thrust::make_counting_iterator(per_row_mapping.size()), gather_map->mutable_view().template begin(), [d_offsets = - 
child_offsets.template begin(), // E.g. [0, 2, 5, 8, 11, 13] + child_offsets.template begin(), // E.g. [0, 2, 5, 8, 11, 13] d_groups = per_row_mapping.template begin(), // E.g. [0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] d_prev = preceding_iter] __device__(auto i) { diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index b87fb80fcc2..0c0ad0ad29e 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -139,9 +139,9 @@ struct filter_chars_fn { { auto const code_point = detail::utf8_to_codepoint(ch); auto const flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; - if (flag == 0) // all types pass unless specifically identified + if (flag == 0) // all types pass unless specifically identified return (types_to_remove == ALL_TYPES); - if (types_to_keep == ALL_TYPES) // filter case + if (types_to_keep == ALL_TYPES) // filter case return (types_to_remove & flag) != 0; return (types_to_keep & flag) == 0; // keep case } diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index cca06ca0739..8a953d778ed 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -317,8 +317,8 @@ struct parse_datetime { bytes_read -= left; break; } - case 'u': [[fallthrough]]; // day of week: Mon(1)-Sat(6),Sun(7) - case 'w': { // day of week; Sun(0),Mon(1)-Sat(6) + case 'u': [[fallthrough]]; // day of week: Mon(1)-Sat(6),Sun(7) + case 'w': { // day of week; Sun(0),Mon(1)-Sat(6) auto const [weekday, left] = parse_int(ptr, item.length); timeparts.weekday = // 0 is mapped to 7 for chrono library static_cast((item.value == 'w' && weekday == 0) ? 
7 : weekday); @@ -1000,7 +1000,7 @@ struct datetime_formatter_fn { case 'S': // second copy_value = timeparts.second; break; - case 'f': // sub-second + case 'f': // sub-second { char subsecond_digits[] = "000000000"; // 9 max digits int const digits = [] { diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 863f76b9b98..6ab70825a6b 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -576,7 +576,7 @@ struct parse_duration { item_length++; // : timeparts->second = parse_second(ptr + item_length, item_length); break; - case 'r': // hh:MM:SS AM/PM + case 'r': // hh:MM:SS AM/PM timeparts->hour = parse_hour(ptr, item_length); item_length++; // : timeparts->minute = parse_minute(ptr + item_length, item_length); diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index ab1e6870937..32167589ab4 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -284,7 +284,7 @@ struct ftos_converter { while (pb != buffer) // reverses the digits *ptr++ = *--pb; // e.g. 
54321 -> 12345 } else - *ptr++ = '0'; // always include at least .0 + *ptr++ = '0'; // always include at least .0 // exponent if (exp10) { *ptr++ = 'e'; @@ -310,7 +310,7 @@ struct ftos_converter { { if (std::isnan(value)) return 3; // NaN bool bneg = false; - if (signbit(value)) { // handles -0.0 too + if (signbit(value)) { // handles -0.0 too value = -value; bneg = true; } @@ -337,7 +337,7 @@ struct ftos_converter { ++count; // always include .0 // exponent if (exp10) { - count += 2; // 'e±' + count += 2; // 'e±' if (exp10 < 0) exp10 = -exp10; count += (int)(exp10 < 10); // padding while (exp10 > 0) { diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 260c3393f3c..5597d2831c0 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -76,7 +76,7 @@ struct string_to_integer_check_fn { auto const digit = static_cast(chr - '0'); auto const bound_check = (bound_val - sign * digit) / IntegerType{10} * sign; if (value > bound_check) return false; - value = value* IntegerType{10} + digit; + value = value * IntegerType{10} + digit; } return true; diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 4606aba6d17..adb72cb0263 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -197,7 +197,7 @@ std::unique_ptr is_ipv4(strings_column_view const& strings, if (d_str.empty()) return false; constexpr int max_ip = 255; // values must be in [0,255] int ip_vals[4] = {-1, -1, -1, -1}; - int ipv_idx = 0; // index into ip_vals + int ipv_idx = 0; // index into ip_vals for (auto const ch : d_str) { if ((ch >= '0') && (ch <= '9')) { auto const ip_val = ip_vals[ipv_idx]; diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 71b6c09310e..9efa148cfd2 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ 
b/cpp/src/strings/convert/convert_urls.cu @@ -107,9 +107,9 @@ struct url_encoder_fn { out_ptr = copy_and_increment(out_ptr, hex, 2); // add them to the output } } - } else // these are to be utf-8 url-encoded + } else // these are to be utf-8 url-encoded { - uint8_t char_bytes[4]; // holds utf-8 bytes for one character + uint8_t char_bytes[4]; // holds utf-8 bytes for one character size_type char_width = from_char_utf8(ch, reinterpret_cast(char_bytes)); nbytes += char_width * 3; // '%' plus 2 hex chars per byte (example: é is %C3%A9) // process each byte in this current character diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 2d2691e0518..c56752f5429 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -984,7 +984,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c col.size(), rmm::device_buffer{0, stream, mr}, // no data cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr), - col.size()); // null count + col.size()); // null count } constexpr int block_size = 512; diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 5fd098a872e..b7a7f19369d 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -184,9 +184,9 @@ class regex_parser { int32_t _id_cclass_d{-1}; // digits [0-9] int32_t _id_cclass_D{-1}; // not digits - char32_t _chr{}; // last lex'd char - int32_t _cclass_id{}; // last lex'd class - int16_t _min_count{}; // data for counted operators + char32_t _chr{}; // last lex'd char + int32_t _cclass_id{}; // last lex'd class + int16_t _min_count{}; // data for counted operators int16_t _max_count{}; std::vector _items; @@ -361,9 +361,9 @@ class regex_parser { auto [q, n_chr] = next_char(); if (n_chr == 0) { return 0; } // malformed: '[x-' - if (!q && n_chr == ']') { // handles: '[x-]' + if (!q && n_chr == ']') { // handles: '[x-]' literals.push_back(chr); - 
literals.push_back(chr); // add '-' as literal + literals.push_back(chr); // add '-' as literal break; } // normal case: '[a-z]' @@ -749,7 +749,7 @@ class regex_parser { // infinite repeats if (n > 0) { // append '+' after last repetition out.push_back(regex_parser::Item{item.type == COUNTED ? PLUS : PLUS_LAZY, 0}); - } else { // copy it once then append '*' + } else { // copy it once then append '*' out.insert(out.end(), begin, end); out.push_back(regex_parser::Item{item.type == COUNTED ? STAR : STAR_LAZY, 0}); } @@ -1095,7 +1095,7 @@ void reprog::build_start_ids() ids.pop(); reinst const& inst = _insts[id]; if (inst.type == OR) { - if (inst.u2.left_id != id) // prevents infinite while-loop here + if (inst.u2.left_id != id) // prevents infinite while-loop here ids.push(inst.u2.left_id); if (inst.u1.right_id != id) // prevents infinite while-loop here ids.push(inst.u1.right_id); diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index aa2cb363b80..ab912ace0df 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -77,16 +77,16 @@ constexpr int32_t NCCLASS_D{1 << 5}; // not CCLASS_D or '\n' * @brief Structure of an encoded regex instruction */ struct reinst { - int32_t type; /* operator type or instruction type */ + int32_t type; /* operator type or instruction type */ union { int32_t cls_id; /* class pointer */ char32_t c; /* character */ int32_t subid; /* sub-expression id for RBRA and LBRA */ int32_t right_id; /* right child of OR */ } u1; - union { /* regexec relies on these two being in the same union */ - int32_t left_id; /* left child of OR */ - int32_t next_id; /* next instruction for CAT & LBRA */ + union { /* regexec relies on these two being in the same union */ + int32_t left_id; /* left child of OR */ + int32_t next_id; /* next instruction for CAT & LBRA */ } u2; int32_t reserved4; }; diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 19d82380350..c1abbd78b43 
100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -253,21 +253,21 @@ class reprog_device { reprog_device(reprog const&); - int32_t _startinst_id; // first instruction id - int32_t _num_capturing_groups; // instruction groups - int32_t _insts_count; // number of instructions - int32_t _starts_count; // number of start-insts ids - int32_t _classes_count; // number of classes - int32_t _max_insts; // for partitioning working memory + int32_t _startinst_id; // first instruction id + int32_t _num_capturing_groups; // instruction groups + int32_t _insts_count; // number of instructions + int32_t _starts_count; // number of start-insts ids + int32_t _classes_count; // number of classes + int32_t _max_insts; // for partitioning working memory uint8_t const* _codepoint_flags{}; // table of character types reinst const* _insts{}; // array of regex instructions int32_t const* _startinst_ids{}; // array of start instruction ids reclass_device const* _classes{}; // array of regex classes - std::size_t _prog_size{}; // total size of this instance - void* _buffer{}; // working memory buffer - int32_t _thread_count{}; // threads available in working memory + std::size_t _prog_size{}; // total size of this instance + void* _buffer{}; // working memory buffer + int32_t _thread_count{}; // threads available in working memory }; /** diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index c5205ae7789..ce12dc17aa4 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -146,17 +146,17 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch, uint32_t codept = utf8_to_codepoint(ch); if (codept > 0x00'FFFF) return false; int8_t fl = codepoint_flags[codept]; - if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl))) // \w + if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl))) // \w return true; - if ((builtins & CCLASS_S) && IS_SPACE(fl)) // \s + if ((builtins 
& CCLASS_S) && IS_SPACE(fl)) // \s return true; - if ((builtins & CCLASS_D) && IS_DIGIT(fl)) // \d + if ((builtins & CCLASS_D) && IS_DIGIT(fl)) // \d return true; if ((builtins & NCCLASS_W) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W return true; - if ((builtins & NCCLASS_S) && !IS_SPACE(fl)) // \S + if ((builtins & NCCLASS_S) && !IS_SPACE(fl)) // \S return true; - if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl))) // \D + if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl))) // \D return true; // return false; diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 460074a5296..81ddb937be5 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -68,7 +68,7 @@ struct replace_regex_fn { if (!match) { break; } // no more matches auto const [start_pos, end_pos] = match_positions_to_bytes(*match, d_str, last_pos); - nbytes += d_repl.size_bytes() - (end_pos - start_pos); // add new size + nbytes += d_repl.size_bytes() - (end_pos - start_pos); // add new size if (out_ptr) { // replace: // i:bbbbsssseeee diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index 099f5978992..0c7d119ea38 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -170,7 +170,7 @@ struct rpartition_fn : public partition_fn { --itr; pos = check_delimiter(idx, d_str, itr); } - if (pos < 0) // delimiter not found + if (pos < 0) // delimiter not found { d_indices_left[idx] = string_index_pair{"", 0}; // two empty d_indices_delim[idx] = string_index_pair{"", 0}; // strings diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index e76d8ac1c60..dc0b04af388 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -190,7 +190,7 @@ struct split_tokenizer_fn : base_split_tokenizer { device_span d_delimiters, device_span d_tokens) const { - auto const base_ptr = 
get_base_ptr(); // d_positions values based on this + auto const base_ptr = get_base_ptr(); // d_positions values based on this auto str_ptr = d_str.data(); auto const str_end = str_ptr + d_str.size_bytes(); // end of the string auto const token_count = static_cast(d_tokens.size()); diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 9aeb6b69bdc..3be5937297f 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -91,7 +91,7 @@ struct token_reader_fn { } else { if (direction == split_direction::FORWARD) { break; } // we are done for (auto l = 0; l < token_idx - 1; ++l) { - d_result[l] = d_result[l + 1]; // shift left + d_result[l] = d_result[l + 1]; // shift left } d_result[token_idx - 1] = token; } diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 57a868485df..c8c68d19ce6 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -86,9 +86,9 @@ thread_safe_per_context_cache d_special_case_mappings; } // namespace - /** - * @copydoc cudf::strings::detail::get_character_flags_table - */ +/** + * @copydoc cudf::strings::detail::get_character_flags_table + */ character_flags_table_type const* get_character_flags_table() { return d_character_codepoint_flags.find_or_initialize([&](void) { diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 78dfb6bf1a6..1b07b0785f5 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -70,7 +70,7 @@ struct normalize_spaces_fn { cudf::string_view const single_space(" ", 1); auto const d_str = d_strings.element(idx); char* buffer = d_chars ? 
d_chars + d_offsets[idx] : nullptr; - char* optr = buffer; // running output pointer + char* optr = buffer; // running output pointer cudf::size_type nbytes = 0; // holds the number of bytes per output string @@ -146,7 +146,7 @@ struct codepoint_to_utf8_fn { char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { uint32_t code_point = *str_cps++; - if (code_point < UTF8_1BYTE) // ASCII range + if (code_point < UTF8_1BYTE) // ASCII range *out_ptr++ = static_cast(code_point); else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index d122f048a4e..34916e121dc 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -114,7 +114,7 @@ using strings_iterator = cudf::column_device_view::const_iterator= end) { break; } // done checking for pairs // skip to the next adjacent pair diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index 1f1b90b3f49..db6ad2e2dd2 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -93,7 +93,7 @@ std::unique_ptr initialize_merge_pairs_map( auto merge_pairs_map = std::make_unique( static_cast(input.size() * 2), // capacity is 2x; cuco::empty_key{-1}, - cuco::empty_value{-1}, // empty value is not used + cuco::empty_value{-1}, // empty value is not used bpe_equal{input}, probe_scheme{bpe_hasher{input}}, hash_table_allocator_type{default_allocator{}, stream}, diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh index fbd2d1efcff..a84e94a6924 100644 --- a/cpp/src/text/utilities/tokenize_ops.cuh +++ b/cpp/src/text/utilities/tokenize_ops.cuh @@ -230,7 +230,7 @@ struct multi_delimiter_strings_tokenizer { }); if (itr_find != delimiters_end) { // found delimiter auto token_size = static_cast((curr_ptr - data_ptr) - last_pos); - if (token_size > 0) // we only 
care about non-zero sized tokens + if (token_size > 0) // we only care about non-zero sized tokens { if (d_str_tokens) d_str_tokens[token_idx] = string_index_pair{data_ptr + last_pos, token_size}; diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 991473c5023..f2909f870aa 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -374,7 +374,7 @@ TEST_F(GroupbyMergeListsTest, StringsColumnInput) "" /*NULL*/, "" /*NULL*/, "German Shepherd", - "" /*NULL*/ + "" /*NULL*/ }, nulls_at({3, 4, 5, 7})}, // key = "dog" lists_col{{"Whale", "" /*NULL*/, "Polar Bear"}, null_at(1)}, // key = "unknown" diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index 67ff61563bb..5fc7e68b524 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -333,7 +333,7 @@ TEST_F(GroupbyMergeSetsTest, StringsColumnInput) lists_col{{"" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, all_nulls()} // key = "dog" }; auto const lists3 = lists_col{ - lists_col{"Fuji", "Red Delicious"}, // key = "apple" + lists_col{"Fuji", "Red Delicious"}, // key = "apple" lists_col{{"" /*NULL*/, "Corgi", "German Shepherd", "" /*NULL*/, "Golden Retriever"}, nulls_at({0, 3})}, // key = "dog" lists_col{{"Seeedless", "Mini"}, no_nulls()} // key = "water melon" @@ -343,14 +343,14 @@ TEST_F(GroupbyMergeSetsTest, StringsColumnInput) merge_sets(vcol_views{keys1, keys2, keys3}, vcol_views{lists1, lists2, lists3}); auto const expected_keys = strings_col{"apple", "banana", "dog", "unknown", "water melon"}; auto const expected_lists = lists_col{ - lists_col{"Fuji", "Honey Bee", "Red Delicious"}, // key = "apple" - lists_col{"Green", "Yellow"}, // key = "banana" + lists_col{"Fuji", "Honey Bee", "Red Delicious"}, // key = "apple" + lists_col{"Green", "Yellow"}, // key = "banana" lists_col{{ "Corgi", "German Shepherd", "Golden Retriever", "Poodle", "" /*NULL*/ }, - 
null_at(4)}, // key = "dog" - lists_col{{"Polar Bear", "Whale", "" /*NULL*/}, null_at(2)}, // key = "unknown" - lists_col{{"Mini", "Seeedless"}, no_nulls()} // key = "water melon" + null_at(4)}, // key = "dog" + lists_col{{"Polar Bear", "Whale", "" /*NULL*/}, null_at(2)}, // key = "unknown" + lists_col{{"Mini", "Seeedless"}, no_nulls()} // key = "water melon" }; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *out_keys, verbosity); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 64aca091686..81e0e12eeb9 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -2166,7 +2166,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList) cudf::io::table_input_metadata metadata(table1); metadata.column_metadata[0].set_nullability(true); // List is nullable at first (root) level metadata.column_metadata[0].child(1).set_nullability( - false); // non-nullable at second (leaf) level + false); // non-nullable at second (leaf) level metadata.column_metadata[1].set_nullability(true); auto filepath = temp_env->get_temp_filepath("ChunkedListNullable.parquet"); @@ -5880,7 +5880,7 @@ TEST_F(ParquetMetadataReaderTest, TestNested) EXPECT_EQ(out_map_col.type_kind(), cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // map ASSERT_EQ(out_map_col.num_children(), 1); - EXPECT_EQ(out_map_col.child(0).name(), "key_value"); // key_value (named in parquet writer) + EXPECT_EQ(out_map_col.child(0).name(), "key_value"); // key_value (named in parquet writer) ASSERT_EQ(out_map_col.child(0).num_children(), 2); EXPECT_EQ(out_map_col.child(0).child(0).name(), "key"); // key (named in parquet writer) EXPECT_EQ(out_map_col.child(0).child(1).name(), "value"); // value (named in parquet writer) @@ -5897,7 +5897,7 @@ TEST_F(ParquetMetadataReaderTest, TestNested) ASSERT_EQ(out_list_col.child(0).num_children(), 1); auto const& out_list_struct_col = out_list_col.child(0).child(0); - EXPECT_EQ(out_list_struct_col.name(), "element"); // elements (named in 
parquet writer) + EXPECT_EQ(out_list_struct_col.name(), "element"); // elements (named in parquet writer) EXPECT_EQ(out_list_struct_col.type_kind(), cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // struct ASSERT_EQ(out_list_struct_col.num_children(), 2); diff --git a/cpp/tests/lists/reverse_tests.cpp b/cpp/tests/lists/reverse_tests.cpp index a899d387c3e..00dc13c5812 100644 --- a/cpp/tests/lists/reverse_tests.cpp +++ b/cpp/tests/lists/reverse_tests.cpp @@ -370,8 +370,8 @@ TYPED_TEST(ListsReverseTypedTest, InputListsOfStructsWithNulls) "Kiwi", "Cherry", "Banana", - "", /*NULL*/ - "", /*NULL*/ + "", /*NULL*/ + "", /*NULL*/ "Apple", "", /*NULL*/ "Banana", // end list1 @@ -436,8 +436,8 @@ TYPED_TEST(ListsReverseTypedTest, InputListsOfStructsWithNulls) "Kiwi", "Cherry", "Banana", - "", /*NULL*/ - "", /*NULL*/ + "", /*NULL*/ + "", /*NULL*/ "Apple", "", /*NULL*/ "Banana", // end list1 diff --git a/cpp/tests/lists/set_operations/difference_distinct_tests.cpp b/cpp/tests/lists/set_operations/difference_distinct_tests.cpp index bf7ebc902ba..84c51f256b7 100644 --- a/cpp/tests/lists/set_operations/difference_distinct_tests.cpp +++ b/cpp/tests/lists/set_operations/difference_distinct_tests.cpp @@ -571,7 +571,7 @@ TEST_F(SetDifferenceTest, InputListsOfNestedStructsHaveNull) "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "Apple", "Banana", "Cherry", "Kiwi", // end list1 "" /*NULL*/, "Bear", "Cat", "Dog", "Duck", - "Panda", // end list2 + "Panda", // end list2 "ÁÁÁ", "ÉÉÉÉÉ", "ÁBC", "ÁÁÁ", "ÍÍÍÍÍ", "" /*NULL*/, "XYZ", "ÁBC" // end list3 diff --git a/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp b/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp index dbccf06036b..11f98af3520 100644 --- a/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp +++ b/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp @@ -514,7 +514,7 @@ TEST_F(SetIntersectTest, InputListsOfNestedStructsHaveNull) null, // end list1 null, // end list2 null, - null // end list3 + null // 
end list3 }, all_nulls()}; auto grandchild2 = strings_col{{ @@ -522,7 +522,7 @@ TEST_F(SetIntersectTest, InputListsOfNestedStructsHaveNull) "Apple", // end list1 "" /*NULL*/, // end list2 "ÁÁÁ", - "ÉÉÉÉÉ" // end list3 + "ÉÉÉÉÉ" // end list3 }, nulls_at({0, 2})}; auto child1 = structs_col{{grandchild1, grandchild2}, null_at(0)}; diff --git a/cpp/tests/lists/set_operations/union_distinct_tests.cpp b/cpp/tests/lists/set_operations/union_distinct_tests.cpp index 5cc0897351d..e33ea31541b 100644 --- a/cpp/tests/lists/set_operations/union_distinct_tests.cpp +++ b/cpp/tests/lists/set_operations/union_distinct_tests.cpp @@ -560,7 +560,7 @@ TEST_F(SetUnionTest, InputListsOfNestedStructsHaveNull) auto grandchild2 = strings_col{{ "" /*NULL*/, "Apple", "Banana", "Cherry", "Kiwi", "Banana", "Cherry", - "Kiwi", // end list1 + "Kiwi", // end list1 "" /*NULL*/, "Bear", "Cat", "Dog", "Duck", "Panda", "Bear", "Cat", "Dog", "Duck", "Panda", // end list2 @@ -597,7 +597,7 @@ TEST_F(SetUnionTest, InputListsOfNestedStructsHaveNull) { "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "Apple", "Apple", "Banana", "Cherry", "Kiwi", "Banana", "Cherry", - "Kiwi", // end list1 + "Kiwi", // end list1 "" /*NULL*/, "" /*NULL*/, "Bear", "Cat", "Dog", "Duck", "Panda", "Bear", "Cat", "Dog", "Duck", "Panda", // end list2 "ÁÁÁ", "ÁÁÁ", "ÉÉÉÉÉ", "ÉÉÉÉÉ", "ÁBC", "ÁÁÁ", "ÍÍÍÍÍ", diff --git a/cpp/tests/lists/stream_compaction/distinct_tests.cpp b/cpp/tests/lists/stream_compaction/distinct_tests.cpp index 57d1714c255..fbc637f9315 100644 --- a/cpp/tests/lists/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/lists/stream_compaction/distinct_tests.cpp @@ -529,7 +529,7 @@ TEST_F(ListDistinctTest, InputListsOfStructsHaveNull) 2, 3, 3, - 3}, // end list3 + 3}, // end list3 nulls_at({1, 6, 12, 13})}; auto child2 = strings_col{{ // begin list1 "XXX", /*NULL*/ @@ -551,7 +551,7 @@ TEST_F(ListDistinctTest, InputListsOfStructsHaveNull) "ÁBC", "ÁÁÁ", "ÍÍÍÍÍ", - "", /*NULL*/ + "", /*NULL*/ 
"XYZ", "ÁBC"}, // end list3 nulls_at({6, 17})}; @@ -670,7 +670,7 @@ TEST_F(ListDistinctTest, InputListsOfNestedStructsHaveNull) "ÁBC", "ÁÁÁ", "ÍÍÍÍÍ", - "", /*NULL*/ + "", /*NULL*/ "XYZ", "ÁBC" // end list3 }, @@ -729,8 +729,8 @@ TEST_F(ListDistinctTest, InputListsOfStructsOfLists) floats_lists{3, 4, 5}, // end list2 // begin list3 floats_lists{}, - floats_lists{}, // end list3 - // begin list4 + floats_lists{}, // end list3 + // begin list4 floats_lists{6, 7}, floats_lists{6, 7}, floats_lists{6, 7}}; diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp index eba6c961bbb..e8ea9d619c5 100644 --- a/cpp/tests/reshape/interleave_columns_tests.cpp +++ b/cpp/tests/reshape/interleave_columns_tests.cpp @@ -806,7 +806,7 @@ TYPED_TEST(ListsColumnsInterleaveTypedTest, SlicedInputListsOfListsWithNulls) ListsCol{ListsCol{{null, 11}, null_at(0)}, ListsCol{{22, null, null}, nulls_at({1, 2})}}, // don't care ListsCol{ListsCol{{null, 11}, null_at(0)}, - ListsCol{{22, null, null}, nulls_at({1, 2})}} // don't care + ListsCol{{22, null, null}, nulls_at({1, 2})}} // don't care }; auto const col1 = cudf::slice(col1_original, {3, 6})[0]; diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp b/cpp/tests/rolling/range_rolling_window_test.cpp index 585383f28f8..eed9db1fe04 100644 --- a/cpp/tests/rolling/range_rolling_window_test.cpp +++ b/cpp/tests/rolling/range_rolling_window_test.cpp @@ -91,7 +91,7 @@ struct window_exec { ScalarT preceding; // Preceding window scalar. ScalarT following; // Following window scalar. 
cudf::size_type min_periods = 1; -}; // struct window_exec; +}; // struct window_exec; struct RangeRollingTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp index b3f98eb54b9..da9666cbc74 100644 --- a/cpp/tests/sort/segmented_sort_tests.cpp +++ b/cpp/tests/sort/segmented_sort_tests.cpp @@ -270,7 +270,7 @@ TEST_F(SegmentedSortInt, Sliced) column_wrapper expected2{{0, 1, 3, 2, 4, 5, 6}}; column_wrapper expected3{{0, 1, 2, 3, 4, 5, 6}}; // clang-format on - auto slice = cudf::slice(col1, {4, 11})[0]; // 7 elements + auto slice = cudf::slice(col1, {4, 11})[0]; // 7 elements cudf::table_view input{{slice}}; auto seg_slice = cudf::slice(segments2, {2, 4})[0]; // 2 elements diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index a16da41af7a..c595977c269 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -50,17 +50,17 @@ TEST_P(CharsTypes, AllTypes) "\t\r\n\f "}; bool expecteds[] = {false, false, false, false, false, false, false, false, - false, false, false, false, false, true, false, false, // decimal + false, false, false, false, false, true, false, false, // decimal false, false, false, false, false, false, false, false, - false, true, false, true, false, true, false, false, // numeric + false, true, false, true, false, true, false, false, // numeric false, false, false, false, false, false, false, false, - false, false, false, true, false, true, false, false, // digit + false, false, false, true, false, true, false, false, // digit true, true, false, true, false, false, false, false, - false, false, false, false, false, false, true, false, // alpha + false, false, false, false, false, false, true, false, // alpha false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, true, // space + false, false, false, false, false, false, false, true, 
// space false, false, false, true, false, false, false, false, - false, false, false, false, false, false, false, false, // upper + false, false, false, false, false, false, false, false, // upper false, true, false, false, false, false, false, false, false, false, false, false, false, false, true, false}; // lower diff --git a/cpp/tests/strings/durations_tests.cpp b/cpp/tests/strings/durations_tests.cpp index 0c7a1ad8042..1902f907f43 100644 --- a/cpp/tests/strings/durations_tests.cpp +++ b/cpp/tests/strings/durations_tests.cpp @@ -398,7 +398,7 @@ TEST_F(StringsDurationsTest, ParseSingle) "-59", "999", "-999", - "", // error + "", // error "01", ""}; // error auto size = cudf::column_view(string_src).size(); @@ -449,7 +449,7 @@ TEST_F(StringsDurationsTest, ParseMultiple) "-59:00:00", "999:00:00", "-999:00:00", - "", // error + "", // error "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); @@ -503,7 +503,7 @@ TEST_F(StringsDurationsTest, ParseSubsecond) "-59:00:00", "999:00:00", "-999:00:00", - "", // error + "", // error "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); @@ -660,7 +660,7 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier) "09:00 AM", // error "", // error "01:01:01", - ""}; // error + ""}; // error cudf::test::fixed_width_column_wrapper expected_s3( {0, diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index bae402155e9..620e0bfe8de 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -440,7 +440,7 @@ class corresponding_rows_not_equivalent { // Must handle inf and nan separately if (std::isinf(x) || std::isinf(y)) { - return x != y; // comparison of (inf==inf) returns true + return x != y; // comparison of (inf==inf) returns true } else if (std::isnan(x) || std::isnan(y)) { return std::isnan(x) != std::isnan(y); // comparison of (nan==nan) returns false } else { From 
97501d87e2070e8f07eb17b2c5e59742c490c6b1 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 20 Sep 2023 07:42:20 +0530 Subject: [PATCH 14/23] Long string optimization for string column parsing in JSON reader (#13803) closes #13724 In old code, 1 thread per string is allocated for parsing a string column. For longer strings (>1024), the runtime of 1-thread-per-string to decode is taking too long even for few strings. In this change, 1 warp per string is used for parsing for strings length <=1024 and 1 block per string for string length >1024. If max string length < 128, 1 thread per string is used as usual. 256 threads_per_block is used for both kernels. Code for 1-warp-per-string and 1-block-per-string is similar, but only varies with warp-wide and block-wide primitives for reduction and scan operations. shared memory usage will differ slightly too. Authors: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Vukasin Milovanovic (https://github.com/vuule) - Elias Stehle (https://github.com/elstehle) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13803 --- cpp/CMakeLists.txt | 2 + cpp/include/cudf/io/detail/data_casting.cuh | 431 -------- cpp/src/io/json/json_column.cu | 39 +- cpp/src/io/json/nested_json_gpu.cu | 22 +- cpp/src/io/json/write_json.cu | 3 +- cpp/src/io/utilities/data_casting.cu | 987 ++++++++++++++++++ cpp/src/io/utilities/parsing_utils.cuh | 24 +- cpp/src/io/utilities/string_parsing.hpp | 79 ++ .../{type_inference.cuh => type_inference.cu} | 57 +- cpp/tests/io/json_test.cpp | 119 +++ cpp/tests/io/json_type_cast_test.cu | 189 +++- cpp/tests/io/type_inference_test.cu | 30 +- 12 files changed, 1395 insertions(+), 587 deletions(-) delete mode 100644 cpp/include/cudf/io/detail/data_casting.cuh create mode 100644 cpp/src/io/utilities/data_casting.cu create 
mode 100644 cpp/src/io/utilities/string_parsing.hpp rename cpp/src/io/utilities/{type_inference.cuh => type_inference.cu} (84%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 900e9eed98e..a84f7bd5224 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -413,11 +413,13 @@ add_library( src/io/utilities/arrow_io_source.cpp src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp + src/io/utilities/data_casting.cu src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp src/io/utilities/parsing_utils.cu src/io/utilities/row_selection.cpp + src/io/utilities/type_inference.cu src/io/utilities/trie.cu src/jit/cache.cpp src/jit/parser.cpp diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh deleted file mode 100644 index b7ee5e05e96..00000000000 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include - -namespace cudf::io::json::detail { - -// Unicode code point escape sequence -static constexpr char UNICODE_SEQ = 0x7F; - -// Invalid escape sequence -static constexpr char NON_ESCAPE_CHAR = 0x7E; - -// Unicode code point escape sequence prefix comprises '\' and 'u' characters -static constexpr size_type UNICODE_ESC_PREFIX = 2; - -// Unicode code point escape sequence comprises four hex characters -static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4; - -// A unicode code point escape sequence is \uXXXX -static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT; - -static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800; -static constexpr auto UTF16_HIGH_SURROGATE_END = 0xDC00; -static constexpr auto UTF16_LOW_SURROGATE_BEGIN = 0xDC00; -static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; - -/** - * @brief Describing whether data casting of a certain item succeed, the item was parsed to null, or - * whether type casting failed. - */ -enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE }; - -/** - * @brief Providing additional information about the type casting result. - */ -struct data_casting_result_info { - // Number of bytes written to output - size_type bytes; - // Whether parsing succeeded, item was parsed to null, or failed - data_casting_result result; -}; - -/** - * @brief Returns the character to output for a given escaped character that's following a - * backslash. - * - * @param escaped_char The character following the backslash. 
- * @return The character to output for a given character that's following a backslash - */ -__device__ __forceinline__ char get_escape_char(char escaped_char) -{ - switch (escaped_char) { - case '"': return '"'; - case '\\': return '\\'; - case '/': return '/'; - case 'b': return '\b'; - case 'f': return '\f'; - case 'n': return '\n'; - case 'r': return '\r'; - case 't': return '\t'; - case 'u': return UNICODE_SEQ; - default: return NON_ESCAPE_CHAR; - } -} - -/** - * @brief Returns the escaped characters for a given character. - * - * @param escaped_char The character to escape. - * @return The escaped characters for a given character. - */ -__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) -{ - switch (escaped_char) { - case '"': return {'\\', '"'}; - case '\\': return {'\\', '\\'}; - case '/': return {'\\', '/'}; - case '\b': return {'\\', 'b'}; - case '\f': return {'\\', 'f'}; - case '\n': return {'\\', 'n'}; - case '\r': return {'\\', 'r'}; - case '\t': return {'\\', 't'}; - // case 'u': return UNICODE_SEQ; - default: return {'\0', escaped_char}; - } -} -/** - * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence - * \uXXXX. - * - * @param str Pointer to the first (most-significant) hex digit - * @return The parsed hex value if successful, -1 otherwise. 
- */ -__device__ __forceinline__ int32_t parse_unicode_hex(char const* str) -{ - // Prepare result - int32_t result = 0, base = 1; - constexpr int32_t hex_radix = 16; - - // Iterate over hex digits right-to-left - size_type index = UNICODE_HEX_DIGIT_COUNT; - while (index-- > 0) { - char const ch = str[index]; - if (ch >= '0' && ch <= '9') { - result += static_cast((ch - '0') + 0) * base; - base *= hex_radix; - } else if (ch >= 'A' && ch <= 'F') { - result += static_cast((ch - 'A') + 10) * base; - base *= hex_radix; - } else if (ch >= 'a' && ch <= 'f') { - result += static_cast((ch - 'a') + 10) * base; - base *= hex_radix; - } else { - return -1; - } - } - return result; -} - -/** - * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to - * \p out_it - */ -constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) -{ - auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) - : strings::detail::from_char_utf8(character, out_it); - if (out_it) out_it += bytes; - return bytes; -} - -/** - * @brief Processes a string, replaces escape sequences and optionally strips off the quote - * characters. - * - * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to - * char - * @param in_begin Iterator to the first item to process - * @param in_end Iterator to one past the last item to process - * @param d_buffer Output character buffer to the first item to write - * @param options Settings for controlling string processing behavior - * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is - * the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether - * parsing succeeded, item was parsed to null, or failed. 
- */ -template -__device__ __forceinline__ data_casting_result_info -process_string(in_iterator_t in_begin, - in_iterator_t in_end, - char* d_buffer, - cudf::io::parse_options_view const& options) -{ - int32_t bytes = 0; - auto const num_in_chars = thrust::distance(in_begin, in_end); - // String values are indicated by keeping the quote character - bool const is_string_value = - num_in_chars >= 2LL && - (options.quotechar == '\0' || - (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); - - // Copy literal/numeric value - if (not is_string_value) { - while (in_begin != in_end) { - if (d_buffer) *d_buffer++ = *in_begin; - ++in_begin; - ++bytes; - } - return {bytes, data_casting_result::PARSING_SUCCESS}; - } - // Whether in the original JSON this was a string value enclosed in quotes - // ({"a":"foo"} vs. {"a":1.23}) - char const backslash_char = '\\'; - - // Escape-flag, set after encountering a backslash character - bool escape = false; - - // Exclude beginning and ending quote chars from string range - if (!options.keepquotes) { - ++in_begin; - --in_end; - } - - // Iterate over the input - while (in_begin != in_end) { - // Copy single character to output - if (!escape) { - escape = (*in_begin == backslash_char); - if (!escape) { - if (d_buffer) *d_buffer++ = *in_begin; - ++bytes; - } - ++in_begin; - continue; - } - - // Previous char indicated beginning of escape sequence - // Reset escape flag for next loop iteration - escape = false; - - // Check the character that is supposed to be escaped - auto escaped_char = get_escape_char(*in_begin); - - // We escaped an invalid escape character -> "fail"/null for this item - if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; } - - // Regular, single-character escape - if (escaped_char != UNICODE_SEQ) { - if (d_buffer) *d_buffer++ = escaped_char; - ++bytes; - ++in_begin; - continue; - } - - // This is an escape sequence of a unicode code point: \uXXXX, 
- // where each X in XXXX represents a hex digit - // Skip over the 'u' char from \uXXXX to the first hex digit - ++in_begin; - - // Make sure that there's at least 4 characters left from the - // input, which are expected to be hex digits - if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) { - return {bytes, data_casting_result::PARSING_FAILURE}; - } - - auto hex_val = parse_unicode_hex(in_begin); - - // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item - if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; } - - // Skip over the four hex digits - thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); - - // If this may be a UTF-16 encoded surrogate pair: - // we expect another \uXXXX sequence - int32_t hex_low_val = 0; - if (thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && - *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { - // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low - // surrogate - hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2)); - } - - // This is indeed a UTF16 surrogate pair - if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && - hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { - // Skip over the second \uXXXX sequence - thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS); - - // Compute UTF16-encoded code point - uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + - (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); - auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); - bytes += write_utf8_char(utf8_chars, d_buffer); - } - - // Just a single \uXXXX sequence - else { - auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); - bytes += write_utf8_char(utf8_chars, d_buffer); - } - } - - // The last character of the input is a backslash -> "fail"/null for this item - if 
(escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } - return {bytes, data_casting_result::PARSING_SUCCESS}; -} - -template -struct string_parse { - str_tuple_it str_tuples; - bitmask_type* null_mask; - size_type* null_count_data; - cudf::io::parse_options_view const options; - size_type* d_offsets{}; - char* d_chars{}; - - __device__ void operator()(size_type idx) - { - if (null_mask != nullptr && not bit_is_set(null_mask, idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const in_begin = str_tuples[idx].first; - auto const in_end = in_begin + str_tuples[idx].second; - auto const num_in_chars = str_tuples[idx].second; - - // Check if the value corresponds to the null literal - auto const is_null_literal = - (!d_chars) && - serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); - if (is_null_literal && null_mask != nullptr) { - clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[idx] = 0; - return; - } - - char* d_buffer = d_chars ? 
d_chars + d_offsets[idx] : nullptr; - auto str_process_info = process_string(in_begin, in_end, d_buffer, options); - if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { - if (null_mask != nullptr) { - clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); - } - if (!d_chars) d_offsets[idx] = 0; - } else { - if (!d_chars) d_offsets[idx] = str_process_info.bytes; - } - } -}; -/** - * @brief Parses the data from an iterator of string views, casting it to the given target data type - * - * @param str_tuples Iterator returning a string view, i.e., a (ptr, length) pair - * @param col_size The total number of items of this column - * @param col_type The column's target data type - * @param null_mask A null mask that renders certain items from the input invalid - * @param options Settings for controlling the processing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr The resource to be used for device memory allocation - * @return The column that contains the parsed data - */ -template -std::unique_ptr parse_data(str_tuple_it str_tuples, - size_type col_size, - data_type col_type, - B&& null_mask, - size_type null_count, - cudf::io::parse_options_view const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - - auto d_null_count = rmm::device_scalar(null_count, stream); - auto null_count_data = d_null_count.data(); - - if (col_type == cudf::data_type{cudf::type_id::STRING}) { - // this utility calls the functor to build the offsets and chars columns; - // the bitmask and null count may be updated by parse failures - auto [offsets, chars] = cudf::strings::detail::make_strings_children( - string_parse{ - str_tuples, static_cast(null_mask.data()), null_count_data, options}, - col_size, - stream, - mr); - - return make_strings_column(col_size, - std::move(offsets), - std::move(chars), - d_null_count.value(stream), - std::move(null_mask)); - } 
- - auto out_col = - make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr); - auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); - - // use existing code (`ConvertFunctor`) to convert values - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__( - size_type row) { - if (col.is_null(row)) { return; } - auto const in = str_tuples[row]; - - auto const is_null_literal = - serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); - - if (is_null_literal) { - col.set_null(row); - atomicAdd(null_count_data, 1); - return; - } - - // If this is a string value, remove quotes - auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar); - - auto const is_parsed = cudf::type_dispatcher(col_type, - ConvertFunctor{}, - in_begin, - in_end, - col.data(), - row, - col_type, - options, - false); - if (not is_parsed) { - col.set_null(row); - atomicAdd(null_count_data, 1); - } - }); - - out_col->set_null_count(d_null_count.value(stream)); - - return out_col; -} - -} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index cabf904f020..5d7fb9d6b43 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -16,14 +16,13 @@ #include "nested_json.hpp" #include -#include +#include #include #include #include #include #include -#include #include #include #include @@ -331,23 +330,27 @@ std::vector copy_strings_to_host(device_span input, { CUDF_FUNC_RANGE(); auto const num_strings = node_range_begin.size(); - rmm::device_uvector> string_views(num_strings, stream); + rmm::device_uvector string_offsets(num_strings, stream); + rmm::device_uvector string_lengths(num_strings, stream); auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), 
node_range_end.begin()); thrust::transform(rmm::exec_policy(stream), d_offset_pairs, d_offset_pairs + num_strings, - string_views.begin(), - [data = input.data()] __device__(auto const& offsets) { + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), + [] __device__(auto const& offsets) { // Note: first character for non-field columns - return thrust::make_pair( - data + thrust::get<0>(offsets), + return thrust::make_tuple( + static_cast(thrust::get<0>(offsets)), static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); }); cudf::io::parse_options_view options_view{}; options_view.quotechar = '\0'; // no quotes options_view.keepquotes = true; - auto d_column_names = parse_data(string_views.begin(), + auto d_offset_length_it = + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); + auto d_column_names = parse_data(input.data(), + d_offset_length_it, num_strings, data_type{type_id::STRING}, rmm::device_buffer{}, @@ -355,7 +358,7 @@ std::vector copy_strings_to_host(device_span input, options_view, stream, rmm::mr::get_current_device_resource()); - auto to_host = [stream](auto const& col) { + auto to_host = [stream](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); auto const h_chars = cudf::detail::make_std_vector_sync( @@ -763,19 +766,6 @@ std::pair, std::vector> device_json_co // TODO how about directly storing pair in json_column? 
auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); - // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference - auto string_ranges_it = - thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { - return thrust::pair{ - thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion - auto string_spans_it = thrust::make_transform_iterator( - offset_length_it, [data = d_input.data()] __device__(auto ip) { - return thrust::pair{ - data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); data_type target_type{}; @@ -790,12 +780,13 @@ std::pair, std::vector> device_json_co // Infer column type, if we don't have an explicit type for it else { target_type = cudf::io::detail::infer_data_type( - options.json_view(), d_input, string_ranges_it, col_size, stream); + options.json_view(), d_input, offset_length_it, col_size, stream); } auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = parse_data(string_spans_it, + auto col = parse_data(d_input.data(), + offset_length_it, col_size, target_type, std::move(result_bitmask), diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 0b49f97597d..06ac11485cb 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -19,14 +19,13 @@ #include #include #include -#include +#include #include #include #include #include #include -#include #include #include #include @@ -1949,20 +1948,6 @@ std::pair, std::vector> json_column_to auto offset_length_it = thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin()); - // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference - auto string_ranges_it = - 
thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { - return thrust::pair{ - thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion - auto string_spans_it = thrust::make_transform_iterator( - offset_length_it, [data = d_input.data()] __device__(auto ip) { - return thrust::pair{ - data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - data_type target_type{}; if (schema.has_value()) { @@ -1978,7 +1963,7 @@ std::pair, std::vector> json_column_to target_type = cudf::io::detail::infer_data_type(parsing_options(options, stream).json_view(), d_input, - string_ranges_it, + offset_length_it, col_size, stream); } @@ -1986,7 +1971,8 @@ std::pair, std::vector> json_column_to auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = parse_data(string_spans_it, + auto col = parse_data(d_input.data(), + offset_length_it, col_size, target_type, std::move(result_bitmask), diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1e44522ed33..2d363c51fce 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -20,6 +20,7 @@ */ #include +#include #include #include @@ -27,9 +28,9 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu new file mode 100644 index 00000000000..1772e5e43fa --- /dev/null +++ b/cpp/src/io/utilities/data_casting.cu @@ -0,0 +1,987 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cudf::io::json::detail { + +constexpr auto SINGLE_THREAD_THRESHOLD = 128; +constexpr auto WARP_THRESHOLD = 128 * 128; // 16K + +// Unicode code point escape sequence +static constexpr char UNICODE_SEQ = 0x7F; + +// Invalid escape sequence +static constexpr char NON_ESCAPE_CHAR = 0x7E; + +// Unicode code point escape sequence prefix comprises '\' and 'u' characters +static constexpr size_type UNICODE_ESC_PREFIX = 2; + +// Unicode code point escape sequence comprises four hex characters +static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4; + +// A unicode code point escape sequence is \uXXXX +static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT; + +static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800; +static constexpr auto UTF16_HIGH_SURROGATE_END = 0xDC00; +static constexpr auto UTF16_LOW_SURROGATE_BEGIN = 0xDC00; +static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; + +/** + * @brief Describing whether data casting of a certain item succeed, the item was parsed to null, or + * whether type casting failed. + */ +enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE }; + +/** + * @brief Providing additional information about the type casting result. 
+ */ +struct data_casting_result_info { + // Number of bytes written to output + size_type bytes; + // Whether parsing succeeded, item was parsed to null, or failed + data_casting_result result; +}; + +/** + * @brief Returns the character to output for a given escaped character that's following a + * backslash. + * + * @param escaped_char The character following the backslash. + * @return The character to output for a given character that's following a backslash + */ +__device__ __forceinline__ char get_escape_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return '"'; + case '\\': return '\\'; + case '/': return '/'; + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'u': return UNICODE_SEQ; + default: return NON_ESCAPE_CHAR; + } +} + +/** + * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence + * \uXXXX. + * + * @param str Pointer to the first (most-significant) hex digit + * @return The parsed hex value if successful, -1 otherwise. 
+ */ +__device__ __forceinline__ int32_t parse_unicode_hex(char const* str) +{ + // Prepare result + int32_t result = 0, base = 1; + constexpr int32_t hex_radix = 16; + + // Iterate over hex digits right-to-left + size_type index = UNICODE_HEX_DIGIT_COUNT; + while (index-- > 0) { + char const ch = str[index]; + if (ch >= '0' && ch <= '9') { + result += static_cast((ch - '0') + 0) * base; + base *= hex_radix; + } else if (ch >= 'A' && ch <= 'F') { + result += static_cast((ch - 'A') + 10) * base; + base *= hex_radix; + } else if (ch >= 'a' && ch <= 'f') { + result += static_cast((ch - 'a') + 10) * base; + base *= hex_radix; + } else { + return -1; + } + } + return result; +} + +/** + * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to + * \p out_it + */ +constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) +{ + auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) + : strings::detail::from_char_utf8(character, out_it); + if (out_it) out_it += bytes; + return bytes; +} + +/** + * @brief Processes a string, replaces escape sequences and optionally strips off the quote + * characters. + * + * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to + * char + * @param in_begin Iterator to the first item to process + * @param in_end Iterator to one past the last item to process + * @param d_buffer Output character buffer to the first item to write + * @param options Settings for controlling string processing behavior + * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is + * the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether + * parsing succeeded, item was parsed to null, or failed. 
+ */ +template +__device__ __forceinline__ data_casting_result_info +process_string(in_iterator_t in_begin, + in_iterator_t in_end, + char* d_buffer, + cudf::io::parse_options_view const& options) +{ + int32_t bytes = 0; + auto const num_in_chars = thrust::distance(in_begin, in_end); + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); + + // Copy literal/numeric value + if (not is_string_value) { + bytes += (in_end - in_begin); + if (d_buffer) d_buffer = thrust::copy(thrust::seq, in_begin, in_end, d_buffer); + return {bytes, data_casting_result::PARSING_SUCCESS}; + } + char constexpr backslash_char = '\\'; + + // Escape-flag, set after encountering a backslash character + bool is_prev_char_escape = false; + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + + // Iterate over the input + while (in_begin != in_end) { + // Copy single character to output + if (!is_prev_char_escape) { + is_prev_char_escape = (*in_begin == backslash_char); + if (!is_prev_char_escape) { + if (d_buffer) *d_buffer++ = *in_begin; + ++bytes; + } + ++in_begin; + continue; + } + + // Previous char indicated beginning of escape sequence + // Reset escape flag for next loop iteration + is_prev_char_escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(*in_begin); + + // We escaped an invalid escape character -> "fail"/null for this item + if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; } + + // Regular, single-character escape + if (escaped_char != UNICODE_SEQ) { + if (d_buffer) *d_buffer++ = escaped_char; + ++bytes; + ++in_begin; + continue; + } + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX 
represents a hex digit + // Skip over the 'u' char from \uXXXX to the first hex digit + ++in_begin; + + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) { + return {bytes, data_casting_result::PARSING_FAILURE}; + } + + auto hex_val = parse_unicode_hex(in_begin); + + // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item + if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; } + + // Skip over the four hex digits + thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + int32_t hex_low_val = 0; + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && + *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { + // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low + // surrogate + hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2)); + } + + // This is indeed a UTF16 surrogate pair + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Skip over the second \uXXXX sequence + thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS); + + // Compute UTF16-encoded code point + uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + bytes += write_utf8_char(utf8_chars, d_buffer); + } else { + // Just a single \uXXXX sequence + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + bytes += write_utf8_char(utf8_chars, d_buffer); + } + } + + // The last character of the input is 
a backslash -> "fail"/null for this item + if (is_prev_char_escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } + return {bytes, data_casting_result::PARSING_SUCCESS}; +} + +/** + * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * warp. + * + * @tparam num_warps number of warps in the block + */ +template +struct bitfield_warp { + static constexpr auto UNICODE_LOOK_BACK{5}; + // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. + // 5+32 for each warp. + bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size]; + + /// Sets all bits to 0 + __device__ void reset(unsigned warp_id) + { + if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) { + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = 0; + } + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0; + } + + /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits + __device__ void shift(unsigned warp_id) + { + if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = + is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size]; + __syncwarp(); + } + + /// Each thread in a warp sets its own bit. + __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) + { + is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] = + is_escaping_backslash; + __syncwarp(); + } + + /// Each thread in a warp gets the requested bit. + __device__ bool get_bit(unsigned warp_id, int bit_index) + { + return is_slash[warp_id][UNICODE_LOOK_BACK + bit_index]; + } +}; + +/** + * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * block. 
+ * + * @tparam num_warps number of warps in the block + */ +template +struct bitfield_block { + static constexpr auto UNICODE_LOOK_BACK{5}; + // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. + // 5 + num_warps*32 for entire block + bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size]; + + /// Sets all bits to 0 + __device__ void reset(unsigned warp_id) + { + if (threadIdx.x < UNICODE_LOOK_BACK) { is_slash[threadIdx.x] = 0; } + is_slash[threadIdx.x + UNICODE_LOOK_BACK] = 0; + } + + /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits + __device__ void shift(unsigned warp_id) + { + if (threadIdx.x < UNICODE_LOOK_BACK) + is_slash[threadIdx.x] = is_slash[num_warps * cudf::detail::warp_size + threadIdx.x]; + __syncthreads(); + } + + /// Each thread in a block sets its own bit. + __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) + { + is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash; + __syncthreads(); + } + + /// Each thread in a block gets the requested bit. + __device__ bool get_bit(unsigned warp_id, int bit_index) + { + return is_slash[UNICODE_LOOK_BACK + bit_index]; + } +}; + +// Algorithm: warp/block parallel version of string_parse and process_string() +// Decoding character classes (u8, u16, \*, *): +// character count: input->output +// \uXXXX 6->2/3/4 +// \uXXXX\uXXXX 12->2/3/4 +// \" 2->1 +// * 1->1 +// +// ERROR conditions. (all collaborating threads quit) +// c=='\' & curr_idx == end_idx-1; +// [c-1]=='\' & get_escape[c]==NEC +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex +// +// skip conditions. (current thread skips this char, no output) +// c=='\' skip. (Escaping char only) +// [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip. +// +// write conditions. 
(write to d_buffer) +// [c-1]!='\' & [c]!='\' write [c] +// [c-1]!='\' & [c]=='\' skip (already covered in skip conditions) +// [c-1]=='\' & [c]!=NEC && [c]!=UNICODE_SEQ, write [c] +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE +// [c+1:4]=curr_hex_val +// // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && +// hex,DECODE [c+7:4]=next_hex_val +// // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && +// hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val +// // if prev_hex_val in high, curr_hex_val in low, skip. +// // if curr_hex_val in high, next_hex_val in low, write [u16] +// if curr_hex_val not in high, write [u8] +// before writing, find num of output characters per threads, +// then do intra-warp/intra-block scan for out_idx +// propagate offset from next iteration to carry forward. +// Uses 1 warp per string or 1 block per string + +/** + * @brief Warp/Block parallel version of string_parse functor + * + * @tparam is_warp True if 1 warp per string, False if 1 block per string + * @tparam num_warps Number of warps per block + * @tparam str_tuple_it Iterator type for tuple with string pointer and its length + * @param str_tuples iterator of tuple with string pointer and its length + * @param total_out_strings Number of string rows to be processed + * @param str_counter Counter to keep track of processed number of strings + * @param null_mask Null mask + * @param null_count_data pointer to store null count + * @param options Settings for controlling string processing behavior + * @param d_offsets Offsets to identify where to store the results for each string + * @param d_chars Character array to store the characters of strings + */ +template +__global__ void parse_fn_string_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + size_type* str_counter, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view 
const options, + size_type* d_offsets, + char* d_chars) +{ + constexpr auto BLOCK_SIZE = + is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps; + size_type lane = is_warp ? (threadIdx.x % BLOCK_SIZE) : threadIdx.x; + + // get 1-string index per warp/block + auto get_next_string = [&]() { + if constexpr (is_warp) { + size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + return __shfl_sync(0xffffffff, istring, 0); + } else { + // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's + // istring value + __syncthreads(); + __shared__ size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + __syncthreads(); + return istring; + } + }; + // grid-stride loop. + for (size_type istring = get_next_string(); istring < total_out_strings; + istring = get_next_string()) { + // skip nulls + if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { + if (!d_chars && lane == 0) d_offsets[istring] = 0; + continue; // gride-stride return; + } + + auto in_begin = str_tuples[istring].first; + auto in_end = in_begin + str_tuples[istring].second; + auto const num_in_chars = str_tuples[istring].second; + if constexpr (is_warp) { + if (num_in_chars <= SINGLE_THREAD_THRESHOLD or num_in_chars > WARP_THRESHOLD) continue; + } else { + if (num_in_chars <= WARP_THRESHOLD) continue; + } + + // Check if the value corresponds to the null literal + if (!d_chars) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + if (lane == 0) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[istring] = 0; + } + continue; // gride-stride return; + } + } + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && 
(*thrust::prev(in_end) == options.quotechar)); + char* d_buffer = d_chars ? d_chars + d_offsets[istring] : nullptr; + + // Copy literal/numeric value + if (not is_string_value) { + if (!d_chars) { + if (lane == 0) { d_offsets[istring] = in_end - in_begin; } + } else { + for (thread_index_type char_index = lane; char_index < (in_end - in_begin); + char_index += BLOCK_SIZE) { + d_buffer[char_index] = in_begin[char_index]; + } + } + continue; // gride-stride return; + } + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + // warp-parallelized or block-parallelized process_string() + + auto is_hex = [](auto ch) { + return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); + }; + + // for backslash scan calculation: is_previous_escaping_backslash + [[maybe_unused]] auto warp_id = threadIdx.x / cudf::detail::warp_size; + bool init_state_reg; + __shared__ bool init_state_shared; + size_type last_offset_reg; + __shared__ size_type last_offset_shared; + bool& init_state(is_warp ? init_state_reg : init_state_shared); + size_type& last_offset(is_warp ? last_offset_reg : last_offset_shared); + if (is_warp || lane == 0) { + init_state = false; + last_offset = 0; + } + using bitfield = + std::conditional_t, bitfield_block>; + __shared__ bitfield is_slash; + is_slash.reset(warp_id); + __syncthreads(); + // 0-31, 32-63, ... i*32-n. + // entire warp executes but with mask. + for (thread_index_type char_index = lane; + char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); + char_index += BLOCK_SIZE) { + bool const is_within_bounds = char_index < (in_end - in_begin); + auto const MASK = is_warp ? __ballot_sync(0xffffffff, is_within_bounds) : 0xffffffff; + auto const c = is_within_bounds ? in_begin[char_index] : '\0'; + auto const prev_c = (char_index > 0 and is_within_bounds) ? 
in_begin[char_index - 1] : '\0'; + auto const escaped_char = get_escape_char(c); + + bool is_escaping_backslash{false}; + [[maybe_unused]] bool is_prev_escaping_backslash{false}; + // To check current is backslash by checking if previous is backslash. + // curr = !prev & c=='\\' + // So, scan is required from beginning of string. + // State table approach (intra-warp FST) (intra-block FST) + // 2 states: Not-Slash(NS), Slash(S). + // prev / * + // NS S NS + // S NS NS + // After inclusive scan, all current S states translate to escaping backslash. + // All escaping backslash should be skipped. + + struct state_table { + // using bit fields instead of state[2] + bool state0 : 1; + bool state1 : 1; + bool inline __device__ get(bool init_state) const { return init_state ? state1 : state0; } + }; + state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. + auto composite_op = [](state_table op1, state_table op2) { + // equivalent of state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; + return state_table{op1.state0 ? op2.state1 : op2.state0, + op1.state1 ? 
op2.state1 : op2.state0}; + }; + state_table scanned; + // inclusive scan of escaping backslashes + if constexpr (is_warp) { + using SlashScan = cub::WarpScan; + __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; + SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); + is_escaping_backslash = scanned.get(init_state); + init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); + __syncwarp(); + is_slash.shift(warp_id); + is_slash.set_bits(warp_id, is_escaping_backslash); + is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); + } else { + using SlashScan = cub::BlockScan; + __shared__ typename SlashScan::TempStorage temp_slash; + SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op); + is_escaping_backslash = scanned.get(init_state); + __syncthreads(); + if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; + __syncthreads(); + is_slash.shift(warp_id); + is_slash.set_bits(warp_id, is_escaping_backslash); + is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); + // There is another __syncthreads() at the end of for-loop. 
+ } + + // String with parsing errors are made as null + bool error = false; + if (is_within_bounds) { + // curr=='\' and end, or prev=='\' and curr=='u' and end-curr < UNICODE_HEX_DIGIT_COUNT + // or prev=='\' and curr=='u' and end-curr >= UNICODE_HEX_DIGIT_COUNT and any non-hex + error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1); + error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR); + error |= (is_prev_escaping_backslash && c == 'u' && + ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | + !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | + !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); + } + // Make sure all threads have no errors before continuing + if constexpr (is_warp) { + error = __any_sync(MASK, error); + } else { + using ErrorReduce = cub::BlockReduce; + __shared__ typename ErrorReduce::TempStorage temp_storage_error; + __shared__ bool error_reduced; + error_reduced = ErrorReduce(temp_storage_error).Sum(error); // TODO use cub::LogicalOR. + // only valid in thread0, so shared memory is used for broadcast. 
+ __syncthreads(); + error = error_reduced; + } + // If any thread has an error, skip the rest of the string and make this string as null + if (error) { + if (!d_chars && lane == 0) { + if (null_mask != nullptr) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + } + last_offset = 0; + d_offsets[istring] = 0; + } + if constexpr (!is_warp) { __syncthreads(); } + break; // gride-stride return; + } + + // Skipping non-copied escaped characters + bool skip = !is_within_bounds; // false; + // skip \ for \" \\ \/ \b \f \n \r \t \uXXXX + skip |= is_escaping_backslash; + if (is_within_bounds) { + // skip X for each X in \uXXXX + skip |= + char_index >= 2 && is_slash.get_bit(warp_id, lane - 2) && in_begin[char_index - 1] == 'u'; + skip |= + char_index >= 3 && is_slash.get_bit(warp_id, lane - 3) && in_begin[char_index - 2] == 'u'; + skip |= + char_index >= 4 && is_slash.get_bit(warp_id, lane - 4) && in_begin[char_index - 3] == 'u'; + skip |= + char_index >= 5 && is_slash.get_bit(warp_id, lane - 5) && in_begin[char_index - 4] == 'u'; + } + int this_num_out = 0; + cudf::char_utf8 write_char{}; + + if (!skip) { + // 1. Unescaped character + if (!is_prev_escaping_backslash) { + this_num_out = 1; + // writes char directly for non-unicode + } else { + // 2. Escaped character + if (escaped_char != UNICODE_SEQ) { + this_num_out = 1; + // writes char directly for non-unicode + } else { + // 3. Unicode + // UTF8 \uXXXX + auto hex_val = parse_unicode_hex(in_begin + char_index + 1); + auto hex_low_val = 0; + // UTF16 \uXXXX\uXXXX + // Note: no need for scanned_backslash below because we already know that + // only '\u' check is enough. 
+ if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + (in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) < + in_end && + in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS - 1] == '\\' && + in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS] == 'u') { + hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); + } + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Compute UTF16-encoded code point + uint32_t unicode_code_point = 0x10000 + + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + write_char = strings::detail::codepoint_to_utf8(unicode_code_point); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } else { + // if hex_val is high surrogate, ideally it should be parsing failure. + // but skipping it as other parsers do this too. + if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { + // Ideally this should be skipped if previous char is high surrogate. + skip = true; + this_num_out = 0; + write_char = 0; + } else { + // if UTF8 + write_char = strings::detail::codepoint_to_utf8(hex_val); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } + } + } + } + } // !skip end. 
+ { + // compute offset to write output for each thread + size_type offset; + if constexpr (is_warp) { + using OffsetScan = cub::WarpScan; + __shared__ typename OffsetScan::TempStorage temp_storage[num_warps]; + OffsetScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); + } else { + using OffsetScan = cub::BlockScan; + __shared__ typename OffsetScan::TempStorage temp_storage; + OffsetScan(temp_storage).ExclusiveSum(this_num_out, offset); + __syncthreads(); + } + offset += last_offset; + // Write output + if (d_chars && !skip) { + auto const is_not_unicode = (!is_prev_escaping_backslash) || escaped_char != UNICODE_SEQ; + if (is_not_unicode) { + *(d_buffer + offset) = (!is_prev_escaping_backslash) ? c : escaped_char; + } else { + strings::detail::from_char_utf8(write_char, d_buffer + offset); + } + } + offset += this_num_out; + if constexpr (is_warp) { + last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1); + } else { + __syncthreads(); + if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset; + __syncthreads(); + } + } + } // char for-loop + if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } + } // grid-stride for-loop +} + +template +struct string_parse { + str_tuple_it str_tuples; + bitmask_type* null_mask; + size_type* null_count_data; + cudf::io::parse_options_view const options; + size_type* d_offsets{}; + char* d_chars{}; + + __device__ void operator()(size_type idx) + { + if (null_mask != nullptr && not bit_is_set(null_mask, idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const in_begin = str_tuples[idx].first; + auto const in_end = in_begin + str_tuples[idx].second; + auto const num_in_chars = str_tuples[idx].second; + + if (num_in_chars > SINGLE_THREAD_THRESHOLD) return; + + // Check if the value corresponds to the null literal + if (!d_chars) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) 
{ + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[idx] = 0; + return; + } + } + + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto str_process_info = process_string(in_begin, in_end, d_buffer, options); + if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { + if (null_mask != nullptr) { + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + } + if (!d_chars) d_offsets[idx] = 0; + } else { + if (!d_chars) d_offsets[idx] = str_process_info.bytes; + } + } +}; + +template +struct to_string_view_pair { + SymbolT const* data; + to_string_view_pair(SymbolT const* _data) : data(_data) {} + __device__ auto operator()(thrust::tuple ip) + { + return thrust::pair{data + thrust::get<0>(ip), + static_cast(thrust::get<1>(ip))}; + } +}; + +template +static std::unique_ptr parse_string(string_view_pair_it str_tuples, + size_type col_size, + rmm::device_buffer&& null_mask, + rmm::device_scalar& d_null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // CUDF_FUNC_RANGE(); + + auto const max_length = thrust::transform_reduce( + rmm::exec_policy(stream), + str_tuples, + str_tuples + col_size, + [] __device__(auto t) { return t.second; }, + size_type{0}, + thrust::maximum{}); + + auto offsets = cudf::make_numeric_column( + data_type{type_to_id()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + auto null_count_data = d_null_count.data(); + + auto single_thread_fn = string_parse{ + str_tuples, static_cast(null_mask.data()), null_count_data, options, d_offsets}; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + constexpr auto warps_per_block = 8; + constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; + auto num_blocks = cudf::util::div_rounding_up_safe(col_size, 
warps_per_block); + auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); + + // TODO run these independent kernels in parallel streams. + if (max_length > SINGLE_THREAD_THRESHOLD) { + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + } + + if (max_length > WARP_THRESHOLD) { + // for strings longer than WARP_THRESHOLD, 1 block per string + str_counter.set_value(0, stream); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + } + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", + std::overflow_error); + + // CHARS column + std::unique_ptr chars = + strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); + auto d_chars = chars->mutable_view().data(); + + single_thread_fn.d_chars = d_chars; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + if (max_length > SINGLE_THREAD_THRESHOLD) { + str_counter.set_value(0, stream); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars); + } + + if (max_length > WARP_THRESHOLD) { + str_counter.set_value(0, stream); + // for strings longer than WARP_THRESHOLD, 1 block per string + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars); + } + + return make_strings_column(col_size, + std::move(offsets), + std::move(chars), + d_null_count.value(stream), + std::move(null_mask)); +} + +std::unique_ptr parse_data( + 
const char* data, + thrust::zip_iterator> offset_length_begin, + size_type col_size, + data_type col_type, + rmm::device_buffer&& null_mask, + size_type null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + if (col_size == 0) { return make_empty_column(col_type); } + auto d_null_count = rmm::device_scalar(null_count, stream); + auto null_count_data = d_null_count.data(); + + // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion + auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data}); + + if (col_type == cudf::data_type{cudf::type_id::STRING}) { + return parse_string(str_tuples, + col_size, + std::forward(null_mask), + d_null_count, + options, + stream, + mr); + } + + auto out_col = + make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr); + auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); + + // use `ConvertFunctor` to convert non-string values + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__( + size_type row) { + if (col.is_null(row)) { return; } + auto const in = str_tuples[row]; + + auto const is_null_literal = + serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); + + if (is_null_literal) { + col.set_null(row); + atomicAdd(null_count_data, 1); + return; + } + + // If this is a string value, remove quotes + auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar); + + auto const is_parsed = cudf::type_dispatcher(col_type, + ConvertFunctor{}, + in_begin, + in_end, + col.data(), + row, + col_type, + options, + false); + if (not is_parsed) { + col.set_null(row); + atomicAdd(null_count_data, 1); + } + }); + + 
out_col->set_null_count(d_null_count.value(stream)); + + return out_col; +} + +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 5c3af588411..43d62fcd513 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,6 +116,28 @@ struct parse_options { } }; +/** + * @brief Returns the two-character escape sequence for a given character, if it has one. + * + * @param escaped_char The character to escape. + * @return `{'\\', X}` if the character has a short escape `\X`, else `{'\0', escaped_char}`. + */ +__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return {'\\', '"'}; + case '\\': return {'\\', '\\'}; + case '/': return {'\\', '/'}; + case '\b': return {'\\', 'b'}; + case '\f': return {'\\', 'f'}; + case '\n': return {'\\', 'n'}; + case '\r': return {'\\', 'r'}; + case '\t': return {'\\', 't'}; + // Any other character is returned as-is, paired with '\0' instead of a backslash. + default: return {'\0', escaped_char}; + } +} + /** * @brief Returns the numeric value of an ASCII/UTF-8 character. * Handles hexadecimal digits, both uppercase and lowercase diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp new file mode 100644 index 00000000000..12fc0a5b2e7 --- /dev/null +++ b/cpp/src/io/utilities/string_parsing.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace cudf::io { +namespace detail { + +/** + * @brief Infers data type for a given JSON string input `data`. + * + * @throw cudf::logic_error if input size is 0 + * @throw cudf::logic_error if date time is not inferred as string + * @throw cudf::logic_error if data type inference failed + * + * @param options View of inference options + * @param data JSON string input + * @param offset_length_begin The beginning of an offset-length tuple sequence + * @param size Size of the string input + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The inferred data type + */ +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream); +} // namespace detail + +namespace json::detail { + +/** + * @brief Parses the data from an iterator of string views, casting it to the given target data type + * + * @param data string input base pointer + * @param offset_length_begin The beginning of an offset-length tuple sequence + * @param col_size The total number of items of this column + * @param col_type The column's target data type + * @param null_mask A null mask that renders certain items from the input invalid + * @param options Settings for controlling the processing behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr The 
resource to be used for device memory allocation + * @return The column that contains the parsed data + */ +std::unique_ptr parse_data( + const char* data, + thrust::zip_iterator> offset_length_begin, + size_type col_size, + data_type col_type, + rmm::device_buffer&& null_mask, + size_type null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); +} // namespace json::detail +} // namespace cudf::io diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cu similarity index 84% rename from cpp/src/io/utilities/type_inference.cuh rename to cpp/src/io/utilities/type_inference.cu index a9ccc80ca33..79a5c8f1c4c 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,23 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#pragma once #include -#include +#include #include #include -#include #include -#include -#include #include -#include -#include - #include #include @@ -114,14 +107,14 @@ __device__ __inline__ bool is_like_float(std::size_t len, * * @param[in] options View of inference options * @param[in] data JSON string input - * @param[in] column_strings_begin The beginning of an offset-length tuple sequence + * @param[in] offset_length_begin The beginning of an offset-length tuple sequence * @param[in] size Size of the string input * @param[out] column_info Histogram of column type counters */ template __global__ void infer_column_type_kernel(OptionsView options, device_span data, - ColumnStringIter column_strings_begin, + ColumnStringIter offset_length_begin, std::size_t size, cudf::io::column_type_histogram* column_info) { @@ -129,8 +122,8 @@ __global__ void infer_column_type_kernel(OptionsView options, for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; idx += gridDim.x * blockDim.x) { - auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); - auto const field_len = thrust::get<1>(*(column_strings_begin + idx)); + auto const field_offset = thrust::get<0>(*(offset_length_begin + idx)); + auto const field_len = thrust::get<1>(*(offset_length_begin + idx)); auto const field_begin = data.begin() + field_offset; if (cudf::detail::serialized_trie_contains( @@ -234,7 +227,7 @@ __global__ void infer_column_type_kernel(OptionsView options, * * @param options View of inference options * @param data JSON string input - * @param column_strings_begin The beginning of an offset-length tuple sequence + * @param offset_length_begin The beginning of an offset-length tuple sequence * @param size Size of the string input * @param stream CUDA stream used for device memory operations and kernel launches * @return A histogram containing column-specific type counters @@ -242,7 +235,7 @@ __global__ void infer_column_type_kernel(OptionsView options, template 
cudf::io::column_type_histogram infer_column_type(OptionsView const& options, cudf::device_span data, - ColumnStringIter column_strings_begin, + ColumnStringIter offset_length_begin, std::size_t const size, rmm::cuda_stream_view stream) { @@ -254,40 +247,22 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); infer_column_type_kernel<<>>( - options, data, column_strings_begin, size, d_column_info.data()); + options, data, offset_length_begin, size, d_column_info.data()); return d_column_info.value(stream); } -/** - * @brief Infers data type for a given JSON string input `data`. - * - * @throw cudf::logic_error if input size is 0 - * @throw cudf::logic_error if date time is not inferred as string - * @throw cudf::logic_error if data type inference failed - * - * @tparam OptionsView Type of inference options view - * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to - * `thrust::tuple` - * - * @param options View of inference options - * @param data JSON string input - * @param column_strings_begin The beginning of an offset-length tuple sequence - * @param size Size of the string input - * @param stream CUDA stream used for device memory operations and kernel launches - * @return The inferred data type - */ -template -cudf::data_type infer_data_type(OptionsView const& options, - device_span data, - ColumnStringIter column_strings_begin, - std::size_t const size, - rmm::cuda_stream_view stream) +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(size != 0, "No data available for data type inference.\n"); - auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream); + auto const h_column_info = 
infer_column_type(options, data, offset_length_begin, size, stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 220f1a3391f..7c911ac2e04 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1370,6 +1371,124 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view()); } +TEST_F(JsonReaderTest, JsonLongString) +{ + // Unicode + // 0000-FFFF Basic Multilingual Plane + // 10000-10FFFF Supplementary Plane + cudf::test::strings_column_wrapper col1{ + { + "\"\\/\b\f\n\r\t", + "\"", + "\\", + "/", + "\b", + "\f\n", + "\r\t", + "$€", + "ராபிட்ஸ்", + "C𝞵𝓓𝒻", + "", // null + "", // null + "கார்த்தி", + "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF + "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF + "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF + "𰾑𱔈𲍉", // 30000-3FFFF + R"("$€ \u0024\u20ac \\u0024\\u20ac \\\u0024\\\u20ac \\\\u0024\\\\u20ac)", + R"( \\\\\\\\\\\\\\\\)", + R"(\\\\\\\\\\\\\\\\)", + R"(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)", + R"( \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)", + R"( \\abcd)", + R"( \\\\\\\\\\\\\\\\ \\\\\\\\\\\\\\\\)", + R"( \\\\\\\\\\\\\\\\ \\\\\\\\\\\\\\\\)", + }, + cudf::test::iterators::nulls_at({10, 11})}; + + cudf::test::fixed_width_column_wrapper repeat_times{ + {1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3}, + cudf::test::iterators::no_nulls()}; + auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times); + auto col2 = d_col2->view(); + cudf::table_view const tbl_view{{col1, col2, repeat_times}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) + 
.include_nulls(true) + .metadata(mt) + .lines(true) + .na_rep("null"); + + cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + + cudf::table_view const expected = tbl_view; + std::map types; + types["col1"] = data_type{type_id::STRING}; + types["col2"] = data_type{type_id::STRING}; + types["int16"] = data_type{type_id::INT16}; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}) + .lines(true) + .dtypes(types); + + // Read test data via nested JSON reader + auto const table = cudf::io::read_json(json_lines_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view()); +} + +TEST_F(JsonReaderTest, ErrorStrings) +{ + // cases of invalid escape characters, invalid unicode encodings. + // Error strings will decode to nulls + auto const buffer = std::string{R"( + {"col0": "\"\a"} + {"col0": "\u"} + {"col0": "\u0"} + {"col0": "\u0b"} + {"col0": "\u00b"} + {"col0": "\u00bz"} + {"col0": "\t34567890123456\t9012345678901\ug0bc"} + {"col0": "\t34567890123456\t90123456789012\u0hbc"} + {"col0": "\t34567890123456\t90123456789012\u00ic"} + {"col0": "\u0b95\u0bbe\u0bb0\u0bcd\u0ba4\u0bcd\u0ba4\u0bbfகார்த்தி"} +)"}; + // Last one is not an error case, but shows that unicode in json is copied string column output. 
+ + cudf::io::json_reader_options const in_opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) + .dtypes({data_type{cudf::type_id::STRING}}) + .lines(true) + .legacy(false); + + auto const result = cudf::io::read_json(in_opts); + auto const result_view = result.tbl->view().column(0); + + EXPECT_EQ(result.metadata.schema_info[0].name, "col0"); + EXPECT_EQ(result_view.null_count(), 9); + cudf::test::strings_column_wrapper expected{ + {"", + "", + "", + "", + "", + "", + "", + "", + "", + "கார்த்தி\xe0\xae\x95\xe0\xae\xbe\xe0\xae\xb0\xe0\xaf\x8d\xe0\xae\xa4\xe0\xaf\x8d\xe0\xae\xa4" + "\xe0\xae\xbf"}, + // unicode hex 0xe0 0xae 0x95 0xe0 0xae 0xbe 0xe0 0xae 0xb0 0xe0 0xaf 0x8d + // 0xe0 0xae 0xa4 0xe0 0xaf 0x8d 0xe0 0xae 0xa4 0xe0 0xae 0xbf + cudf::test::iterators::nulls_at({0, 1, 2, 3, 4, 5, 6, 7, 8})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected); +} + TEST_F(JsonReaderTest, TokenAllocation) { std::array const json_inputs{ diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 5c32131114d..9eb5e8f5230 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -21,15 +21,20 @@ #include #include +#include + #include #include #include -#include #include #include #include #include +#include + +#include +#include #include using namespace cudf::test::iterators; @@ -37,13 +42,27 @@ using namespace cudf::test::iterators; struct JSONTypeCastTest : public cudf::test::BaseFixture {}; namespace { -struct to_thrust_pair_fn { - __device__ thrust::pair operator()( - thrust::pair const& p) +struct offsets_to_length { + __device__ cudf::size_type operator()(thrust::tuple const& p) { - return {p.first.data(), p.first.size_bytes()}; + return thrust::get<1>(p) - thrust::get<0>(p); } }; + +/// Returns the length of each string in the column, computed on `stream` from adjacent offsets +auto string_offset_to_length(cudf::strings_column_view const& column, rmm::cuda_stream_view stream) +{ + auto 
offsets_begin = column.offsets_begin(); + auto offsets_pair = + thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); + rmm::device_uvector svs_length(column.size(), stream); + thrust::transform(rmm::exec_policy(stream), + offsets_pair, + offsets_pair + column.size(), + svs_length.begin(), + offsets_to_length{}); + return svs_length; +} } // namespace auto default_json_options() @@ -67,26 +86,23 @@ TEST_F(JSONTypeCastTest, String) std::vector input_values{"this", "is", "null", "of", "", "strings", R"("null")"}; cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids); - auto d_column = cudf::column_device_view::create(input); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(input); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto str_col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto str_col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto out_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); @@ -103,26 +119,23 @@ TEST_F(JSONTypeCastTest, Int) auto const type = cudf::data_type{cudf::type_id::INT64}; 
cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"}); - auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(data); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; @@ -146,26 +159,23 @@ TEST_F(JSONTypeCastTest, StringEscapes) R"("escape with nothing to escape \")", R"("\"\\\/\b\f\n\r\t")", }); - auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(data); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - 
type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::strings_column_wrapper{ {"🚀", "A🚀AA", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"}, @@ -173,4 +183,71 @@ TEST_F(JSONTypeCastTest, StringEscapes) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } +TEST_F(JSONTypeCastTest, ErrorNulls) +{ + auto const stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + auto const type = cudf::data_type{cudf::type_id::STRING}; + + // error in decoding + std::vector input_values{R"("\"\a")", + R"("\u")", + R"("\u0")", + R"("\u0b")", + R"("\u00b")", + R"("\u00bz")", + R"("\t34567890123456\t9012345678901\ug0bc")", + R"("\t34567890123456\t90123456789012\u0hbc")", + R"("\t34567890123456\t90123456789012\u00ic")", + R"("\t34567890123456\t9012345678901\")", + R"("\t34567890123456\t90123456789012\")", + R"(null)"}; + // Note: without quotes are copied without decoding + cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end()); + + auto column = cudf::strings_column_view(input); + auto space_length = 128; + auto prepend_space = [&space_length](auto const& s) { + if (s[0] == '"') return "\"" + std::string(space_length, ' ') + std::string(s + 1); + return std::string(s); + }; + std::vector small_input; + std::transform( + input_values.begin(), input_values.end(), std::back_inserter(small_input), prepend_space); + cudf::test::strings_column_wrapper small_col(small_input.begin(), small_input.end()); + + std::vector large_input; + space_length = 128 * 128; + std::transform( + input_values.begin(), 
input_values.end(), std::back_inserter(large_input), prepend_space); + cudf::test::strings_column_wrapper large_col(large_input.begin(), large_input.end()); + + std::vector expected_values{"", "", "", "", "", "", "", "", "", "", "", ""}; + cudf::test::strings_column_wrapper expected( + expected_values.begin(), expected_values.end(), cudf::test::iterators::all_nulls()); + + // single threads, warp, block. + for (auto const& column : + {column, cudf::strings_column_view(small_col), cudf::strings_column_view(large_col)}) { + rmm::device_uvector svs_length = string_offset_to_length(column, stream); + + auto null_mask_it = no_nulls(); + auto null_mask = + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto str_col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(str_col->view(), expected); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index b2eb1b94f9c..a14e7ecf5b3 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ +#include #include -#include #include #include @@ -50,8 +50,8 @@ TEST_F(TypeInference, Basic) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 4, 7}; - auto const string_length = std::vector{2, 2, 1}; + auto const string_offset = std::vector{1, 4, 7}; + auto const string_length = std::vector{2, 2, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -83,8 +83,8 @@ TEST_F(TypeInference, Null) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 4}; - auto const string_length = std::vector{0, 2, 1}; + auto const string_offset = std::vector{1, 1, 4}; + auto const string_length = std::vector{0, 2, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -116,8 +116,8 @@ TEST_F(TypeInference, AllNull) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 1}; - auto const string_length = std::vector{0, 0, 4}; + auto const string_offset = std::vector{1, 1, 1}; + auto const string_length = std::vector{0, 0, 4}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -149,8 +149,8 @@ TEST_F(TypeInference, String) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 8, 12}; - auto const string_length = std::vector{6, 3, 4}; 
+ auto const string_offset = std::vector{1, 8, 12}; + auto const string_length = std::vector{6, 3, 4}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -182,8 +182,8 @@ TEST_F(TypeInference, Bool) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 6, 12}; - auto const string_length = std::vector{4, 5, 5}; + auto const string_offset = std::vector{1, 6, 12}; + auto const string_length = std::vector{4, 5, 5}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -215,8 +215,8 @@ TEST_F(TypeInference, Timestamp) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 10}; - auto const string_length = std::vector{8, 9}; + auto const string_offset = std::vector{1, 10}; + auto const string_length = std::vector{8, 9}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -249,8 +249,8 @@ TEST_F(TypeInference, InvalidInput) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 3, 5, 7, 9}; - auto const string_length = std::vector{1, 1, 1, 1, 1}; + auto const string_offset = std::vector{1, 3, 5, 7, 9}; + auto const string_length = std::vector{1, 1, 1, 1, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto 
const d_string_length = cudf::detail::make_device_uvector_async( From 63d197fe029ff2b57f4e0c7ab975bb35f844fc25 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 19 Sep 2023 19:27:10 -0700 Subject: [PATCH 15/23] Avoid circular cimports in _lib/cpp/reduce.pxd (#14125) This Cython module contains some cimports from higher-level modules than it should, which introduces the possibility for circular import issues. Also it contains an unused import of DeviceScalar that can cause similar issues. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14125 --- python/cudf/cudf/_lib/cpp/reduce.pxd | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/reduce.pxd b/python/cudf/cudf/_lib/cpp/reduce.pxd index 7952c717916..997782dec6c 100644 --- a/python/cudf/cudf/_lib/cpp/reduce.pxd +++ b/python/cudf/cudf/_lib/cpp/reduce.pxd @@ -1,14 +1,13 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr from libcpp.utility cimport pair -from cudf._lib.aggregation cimport reduce_aggregation, scan_aggregation +from cudf._lib.cpp.aggregation cimport reduce_aggregation, scan_aggregation from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport data_type -from cudf._lib.scalar cimport DeviceScalar cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: From 2d4f22a9ab0709f808af9253097037e0eb5d00b1 Mon Sep 17 00:00:00 2001 From: Sam Turner <98767222+stmio@users.noreply.github.com> Date: Wed, 20 Sep 2023 13:57:26 +0100 Subject: [PATCH 16/23] Implement `GroupBy.value_counts` to match pandas API (#14114) This PR implements `GroupBy.value_counts`, matching the [pandas equivalent](https://pandas.pydata.org/docs/dev/reference/api/pandas.core.groupby.DataFrameGroupBy.value_counts.html) method. Tests currently ignore the returned Series/DataFrame's name, as this was [added to pandas in v2.0.0](https://github.com/pandas-dev/pandas/commit/bec92a43feb0057f06f4f9b9db26c1a09232b1c0). This can be removed if tests are against `pandas>=2.0.0`. 
Closes #12789 Authors: - Sam Turner (https://github.com/stmio) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14114 --- python/cudf/cudf/core/groupby/groupby.py | 164 +++++++++++++++++++++++ python/cudf/cudf/tests/test_groupby.py | 67 +++++++++ 2 files changed, 231 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b300c55b537..e1740140b44 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2336,6 +2336,170 @@ def pct_change( shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 + def value_counts( + self, + subset=None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrameOrSeries: + """ + Return a Series or DataFrame containing counts of unique rows. + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + Series or DataFrame + Series if the groupby as_index is True, otherwise DataFrame. + + See Also + -------- + Series.value_counts: Equivalent method on Series. + DataFrame.value_counts: Equivalent method on DataFrame. + SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. + + Notes + ----- + - If the groupby as_index is True then the returned Series will have a + MultiIndex with one level per input column. + - If the groupby as_index is False then the returned DataFrame will + have an additional column with the value_counts. 
The column is + labelled 'count' or 'proportion', depending on the ``normalize`` + parameter. + + By default, rows that contain any NA values are omitted from + the result. + + By default, the result will be in descending order so that the + first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.50 + US 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.50 + 1 female high US 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 + """ + + df = cudf.DataFrame.copy(self.obj) + groupings = self.grouping.names + name = "proportion" if normalize else "count" + + if subset is None: + subset = [i for i in df._column_names if i not in groupings] + # Check subset exists in dataframe + elif set(subset) - set(df._column_names): + raise ValueError( + 
f"Keys {set(subset) - set(df._column_names)} in subset " + f"do not exist in the DataFrame." + ) + # Catch case where groupby and subset share an element + elif set(subset) & set(groupings): + raise ValueError( + f"Keys {set(subset) & set(groupings)} in subset " + "cannot be in the groupby column keys." + ) + + df["__placeholder"] = 1 + result = ( + df.groupby(groupings + list(subset), dropna=dropna)[ + "__placeholder" + ] + .count() + .sort_index() + .astype(np.int64) + ) + + if normalize: + levels = list(range(len(groupings), result.index.nlevels)) + result /= result.groupby( + result.index.droplevel(levels), + ).transform("sum") + + if sort: + result = result.sort_values(ascending=ascending).sort_index( + level=range(len(groupings)), sort_remaining=False + ) + + if not self._as_index: + if name in df._column_names: + raise ValueError( + f"Column label '{name}' is duplicate of result column" + ) + result.name = name + result = result.to_frame().reset_index() + else: + result.name = name + + return result + def _mimic_pandas_order( self, result: DataFrameOrSeries ) -> DataFrameOrSeries: diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 042f0e1aa38..376639d5226 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3473,3 +3473,70 @@ def test_categorical_grouping_pandas_compatibility(): expected = pdf.groupby("key", sort=False).sum() assert_eq(actual, expected) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("as_index", [True, False]) +def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index): + # From Issue#12789 + df = cudf.DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", np.nan, "low", "high", 
"low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + pdf = df.to_pandas() + + actual = df.groupby("gender", as_index=as_index).value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + expected = pdf.groupby("gender", as_index=as_index).value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + + # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` + assert_groupby_results_equal( + actual, expected, check_names=False, check_index_type=False + ) + + +def test_group_by_value_counts_subset(): + # From Issue#12789 + df = cudf.DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + pdf = df.to_pandas() + + actual = df.groupby("gender").value_counts(["education"]) + expected = pdf.groupby("gender").value_counts(["education"]) + + # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` + assert_groupby_results_equal( + actual, expected, check_names=False, check_index_type=False + ) + + +def test_group_by_value_counts_clash_with_subset(): + df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a").value_counts(["a"]) + + +def test_group_by_value_counts_subset_not_exists(): + df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a").value_counts(["c"]) + + +def test_group_by_value_counts_with_count_column(): + df = cudf.DataFrame({"a": [1, 5, 3], "count": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a", as_index=False).value_counts() From 7b0693f6a5fd58e247a7669a813c6ffba850e4e0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 20 Sep 2023 04:46:35 -1000 Subject: [PATCH 17/23] Fix DataFrame.values with no columns but index (#14134) Fixes the following 
```python In [32]: cudf.DataFrame(index=range(10)).values Out[32]: array([], shape=(0, 0), dtype=float64) ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14134 --- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6224793d6f1..1e6d177f8ca 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -437,7 +437,7 @@ def get_column_values_na(col): ncol = self._num_columns if ncol == 0: return make_empty_matrix( - shape=(0, 0), dtype=np.dtype("float64"), order="F" + shape=(len(self), ncol), dtype=np.dtype("float64"), order="F" ) if dtype is None: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index cbef9bfa2d8..b69f22ade81 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10374,3 +10374,9 @@ def test_dataframe_init_from_nested_dict(): pdf = pd.DataFrame(regular_dict) gdf = cudf.DataFrame(regular_dict) assert_eq(pdf, gdf) + + +def test_data_frame_values_no_cols_but_index(): + result = cudf.DataFrame(index=range(5)).values + expected = pd.DataFrame(index=range(5)).values + assert_eq(result, expected) From f7ca051145d41cf323cfb5a066068cb8b75d3fb3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 20 Sep 2023 10:49:06 -0500 Subject: [PATCH 18/23] Fix type of empty `Index` and raise warning in `Series` constructor (#14116) Fixes: #14091 This PR fixes empty inputs dtype in `Index` to default to `str` instead of `float64`. Another change is there is a deprecation warning for `Series` constructor to match pandas. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14116 --- python/cudf/cudf/core/algorithms.py | 21 +++++++---- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/index.py | 12 ++++++- python/cudf/cudf/core/series.py | 32 +++++++++++++++-- python/cudf/cudf/testing/_utils.py | 21 +++++++++-- python/cudf/cudf/tests/test_dataframe.py | 19 +++++----- python/cudf/cudf/tests/test_dropna.py | 9 +++-- python/cudf/cudf/tests/test_duplicates.py | 4 +-- python/cudf/cudf/tests/test_index.py | 16 ++++++--- python/cudf/cudf/tests/test_rolling.py | 9 +++-- python/cudf/cudf/tests/test_series.py | 43 ++++++++++++++--------- python/cudf/cudf/tests/test_stats.py | 23 ++++++------ 12 files changed, 148 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index a472142ece0..25d58029d6b 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -4,12 +4,13 @@ import cupy as cp import numpy as np +from cudf.core.column import as_column from cudf.core.copy_types import BooleanMask -from cudf.core.index import Index, RangeIndex +from cudf.core.index import RangeIndex, as_index from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar -from cudf.core.series import Series from cudf.options import get_option +from cudf.utils.dtypes import can_convert_to_column def factorize( @@ -95,7 +96,13 @@ def factorize( return_cupy_array = isinstance(values, cp.ndarray) - values = Series(values) + if not can_convert_to_column(values): + raise TypeError( + "'values' can only be a Series, Index, or CuPy array, " + f"got {type(values)}" + ) + + values = as_column(values) if na_sentinel is None: na_sentinel = ( @@ -128,22 +135,22 @@ def factorize( warnings.warn("size_hint is not applicable for cudf.factorize") if use_na_sentinel is None or use_na_sentinel: - cats 
= values._column.dropna() + cats = values.dropna() else: - cats = values._column + cats = values cats = cats.unique().astype(values.dtype) if sort: cats = cats.sort_values() - labels = values._column._label_encoding( + labels = values._label_encoding( cats=cats, na_sentinel=Scalar(na_sentinel), dtype="int64" if get_option("mode.pandas_compatible") else None, ).values - return labels, cats.values if return_cupy_array else Index(cats) + return labels, cats.values if return_cupy_array else as_index(cats) def _linear_interpolation(column, index=None): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 84c16b71997..6e664468644 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5607,7 +5607,7 @@ def quantile( result.name = q return result - result.index = list(map(float, qs)) + result.index = cudf.Index(list(map(float, qs)), dtype="float64") return result @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 56ec9ce0359..de8a5948033 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -13,6 +13,7 @@ List, MutableMapping, Optional, + Sequence, Tuple, Type, Union, @@ -3467,7 +3468,7 @@ def __new__( "tupleize_cols != True is not yet supported" ) - return as_index( + res = as_index( data, copy=copy, dtype=dtype, @@ -3475,6 +3476,15 @@ def __new__( nan_as_null=nan_as_null, **kwargs, ) + if ( + isinstance(data, Sequence) + and not isinstance(data, range) + and len(data) == 0 + and dtype is None + and getattr(data, "dtype", None) is None + ): + return res.astype("str") + return res @classmethod @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7692d3015f8..a195738af54 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,7 +9,16 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing 
import Any, Dict, MutableMapping, Optional, Set, Tuple, Union +from typing import ( + Any, + Dict, + MutableMapping, + Optional, + Sequence, + Set, + Tuple, + Union, +) import cupy import numpy as np @@ -500,6 +509,18 @@ def __init__( copy=False, nan_as_null=True, ): + if ( + isinstance(data, Sequence) + and len(data) == 0 + and dtype is None + and getattr(data, "dtype", None) is None + ): + warnings.warn( + "The default dtype for empty Series will be 'object' instead " + "of 'float64' in a future version. Specify a dtype explicitly " + "to silence this warning.", + FutureWarning, + ) if isinstance(data, pd.Series): if name is None: name = data.name @@ -656,7 +677,10 @@ def from_pandas(cls, s, nan_as_null=None): 3 NaN dtype: float64 """ - return cls(s, nan_as_null=nan_as_null) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + result = cls(s, nan_as_null=nan_as_null) + return result @property # type: ignore @_cudf_nvtx_annotate @@ -2642,7 +2666,9 @@ def mode(self, dropna=True): if len(val_counts) > 0: val_counts = val_counts[val_counts == val_counts.iloc[0]] - return Series(val_counts.index.sort_values(), name=self.name) + return Series._from_data( + {self.name: val_counts.index.sort_values()}, name=self.name + ) @_cudf_nvtx_annotate def round(self, decimals=0, how="half_even"): diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index e949f7d78e7..9182246826f 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -397,8 +397,12 @@ def assert_column_memory_ne( raise AssertionError("lhs and rhs holds the same memory.") -def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs): - # Wrapper around pd.Series using a float64 default dtype for empty data. +def _create_pandas_series_float64_default( + data=None, index=None, dtype=None, *args, **kwargs +): + # Wrapper around pd.Series using a float64 + # default dtype for empty data to silence warnings. 
+ # TODO: Remove this in pandas-2.0 upgrade if dtype is None and ( data is None or (not is_scalar(data) and len(data) == 0) ): @@ -406,6 +410,19 @@ def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs): return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs) +def _create_cudf_series_float64_default( + data=None, index=None, dtype=None, *args, **kwargs +): + # Wrapper around cudf.Series using a float64 + # default dtype for empty data to silence warnings. + # TODO: Remove this in pandas-2.0 upgrade + if dtype is None and ( + data is None or (not is_scalar(data) and len(data) == 0) + ): + dtype = "float64" + return cudf.Series(data=data, index=index, dtype=dtype, *args, **kwargs) + + parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( "left_dtype,right_dtype", list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b69f22ade81..bc85987c612 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -30,6 +30,7 @@ ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, + _create_cudf_series_float64_default, assert_eq, assert_exceptions_equal, assert_neq, @@ -2000,8 +2001,8 @@ def test_series_shape(): def test_series_shape_empty(): - ps = pd.Series(dtype="float64") - cs = cudf.Series([]) + ps = pd.Series([], dtype="float64") + cs = cudf.Series([], dtype="float64") assert ps.shape == cs.shape @@ -2840,7 +2841,7 @@ def test_series_all_null(num_elements, null_type): @pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) def test_series_all_valid_nan(num_elements): data = [np.nan] * num_elements - sr = cudf.Series(data, nan_as_null=False) + sr = _create_cudf_series_float64_default(data, nan_as_null=False) np.testing.assert_equal(sr.null_count, 0) @@ -4073,28 +4074,28 @@ def test_empty_dataframe_describe(): def test_as_column_types(): - col = column.as_column(cudf.Series([])) + col 
= column.as_column(cudf.Series([], dtype="float64")) assert_eq(col.dtype, np.dtype("float64")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="float64")) assert_eq(pds, gds) - col = column.as_column(cudf.Series([]), dtype="float32") + col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32") assert_eq(col.dtype, np.dtype("float32")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="float32")) assert_eq(pds, gds) - col = column.as_column(cudf.Series([]), dtype="str") + col = column.as_column(cudf.Series([], dtype="float64"), dtype="str") assert_eq(col.dtype, np.dtype("object")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="str")) assert_eq(pds, gds) - col = column.as_column(cudf.Series([]), dtype="object") + col = column.as_column(cudf.Series([], dtype="float64"), dtype="object") assert_eq(col.dtype, np.dtype("object")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="object")) @@ -4469,7 +4470,7 @@ def test_create_dataframe_column(): ) def test_series_values_host_property(data): pds = pd.Series(data=data, dtype=None if data else float) - gds = cudf.Series(data) + gds = _create_cudf_series_float64_default(data) np.testing.assert_array_equal(pds.values, gds.values_host) @@ -4492,7 +4493,7 @@ def test_series_values_host_property(data): ) def test_series_values_property(data): pds = pd.Series(data=data, dtype=None if data else float) - gds = cudf.Series(data) + gds = _create_cudf_series_float64_default(data) gds_vals = gds.values assert isinstance(gds_vals, cupy.ndarray) np.testing.assert_array_equal(gds_vals.get(), pds.values) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 3277e52edb3..1def6597706 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -1,11 +1,14 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import numpy as np import pandas as pd import pytest import cudf -from cudf.testing._utils import _create_pandas_series, assert_eq +from cudf.testing._utils import ( + _create_pandas_series_float64_default, + assert_eq, +) @pytest.mark.parametrize( @@ -22,7 +25,7 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) if len(data) > 0: if nulls == "one": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index f77e7b4d775..ddbfdf5eee2 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -10,7 +10,7 @@ import cudf from cudf import concat from cudf.testing._utils import ( - _create_pandas_series, + _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, ) @@ -62,7 +62,7 @@ def test_duplicated_with_misspelled_column_name(subset): ], ) def test_drop_duplicates_series(data, keep): - pds = _create_pandas_series(data) + pds = _create_pandas_series_float64_default(data) gds = cudf.from_pandas(pds) assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index b3791cddce3..29232f63e90 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -30,7 +30,8 @@ SIGNED_INTEGER_TYPES, SIGNED_TYPES, UNSIGNED_TYPES, - _create_pandas_series, + _create_cudf_series_float64_default, + _create_pandas_series_float64_default, assert_column_memory_eq, assert_column_memory_ne, assert_eq, @@ -1006,8 +1007,8 @@ def test_index_equal_misc(data, other): actual = gd_data.equals(np.array(gd_other)) assert_eq(expected, actual) - expected = pd_data.equals(_create_pandas_series(pd_other)) - actual = gd_data.equals(cudf.Series(gd_other)) + expected = pd_data.equals(_create_pandas_series_float64_default(pd_other)) 
+ actual = gd_data.equals(_create_cudf_series_float64_default(gd_other)) assert_eq(expected, actual) expected = pd_data.astype("category").equals(pd_other) @@ -2275,7 +2276,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): ], ) def test_isin_index(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.index.isin(values) @@ -2780,6 +2781,13 @@ def test_index_empty_from_pandas(request, dtype): assert_eq(pidx, gidx) +def test_empty_index_init(): + pidx = pd.Index([]) + gidx = cudf.Index([]) + + assert_eq(pidx, gidx) + + @pytest.mark.parametrize( "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] ) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index b4e0983a9e3..43fa83e1735 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -9,7 +9,10 @@ import cudf from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 -from cudf.testing._utils import _create_pandas_series, assert_eq +from cudf.testing._utils import ( + _create_pandas_series_float64_default, + assert_eq, +) from cudf.testing.dataset_generator import rand_dataframe @@ -55,7 +58,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center): elif nulls == "all": data = [np.nan] * len(data) - psr = _create_pandas_series(data, index=index) + psr = _create_pandas_series_float64_default(data, index=index) gsr = cudf.Series(psr) for window_size in range(1, len(data) + 1): for min_periods in range(1, window_size + 1): @@ -313,7 +316,7 @@ def test_rolling_getitem_window(): @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = _create_pandas_series(data, index=index) + psr = _create_pandas_series_float64_default(data, index=index) gsr = cudf.from_pandas(psr) def some_func(A): diff --git a/python/cudf/cudf/tests/test_series.py 
b/python/cudf/cudf/tests/test_series.py index b1e991106ee..cfa571a0f54 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -19,7 +19,8 @@ NUMERIC_TYPES, SERIES_OR_INDEX_NAMES, TIMEDELTA_TYPES, - _create_pandas_series, + _create_cudf_series_float64_default, + _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -400,8 +401,8 @@ def test_series_tolist(data): [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], ) def test_series_size(data): - psr = _create_pandas_series(data) - gsr = cudf.Series(data) + psr = _create_pandas_series_float64_default(data) + gsr = _create_cudf_series_float64_default(data) assert_eq(psr.size, gsr.size) @@ -487,7 +488,7 @@ def test_series_describe_other_types(ps): ) @pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0]) def test_series_factorize(data, na_sentinel): - gsr = cudf.Series(data) + gsr = _create_cudf_series_float64_default(data) psr = gsr.to_pandas() with pytest.warns(FutureWarning): @@ -510,7 +511,7 @@ def test_series_factorize(data, na_sentinel): ) @pytest.mark.parametrize("use_na_sentinel", [True, False]) def test_series_factorize_use_na_sentinel(data, use_na_sentinel): - gsr = cudf.Series(data) + gsr = _create_cudf_series_float64_default(data) psr = gsr.to_pandas(nullable=True) expected_labels, expected_cats = psr.factorize( @@ -534,7 +535,7 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel): ) @pytest.mark.parametrize("sort", [True, False]) def test_series_factorize_sort(data, sort): - gsr = cudf.Series(data) + gsr = _create_cudf_series_float64_default(data) psr = gsr.to_pandas(nullable=True) expected_labels, expected_cats = psr.factorize(sort=sort) @@ -734,7 +735,7 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize): ], dtype="datetime64[ns]", ), - cudf.Series(name="empty series"), + cudf.Series(name="empty series", dtype="float64"), cudf.Series(["a", "b", "c", " ", 
"a", "b", "z"], dtype="category"), ], ) @@ -1415,7 +1416,7 @@ def test_series_hash_values_invalid_method(): def test_set_index_unequal_length(): - s = cudf.Series() + s = cudf.Series(dtype="float64") with pytest.raises(ValueError): s.index = [1, 2, 3] @@ -1682,7 +1683,7 @@ def test_series_nunique_index(data): ], ) def test_axes(data): - csr = cudf.Series(data) + csr = _create_cudf_series_float64_default(data) psr = csr.to_pandas() expected = psr.axes @@ -1760,7 +1761,7 @@ def test_series_truncate_datetimeindex(): ) def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) - psr = _create_pandas_series(data, index=index) + psr = _create_pandas_series_float64_default(data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) expected = psr.isin(values) @@ -1820,7 +1821,7 @@ def test_fill_new_category(): ], ) def test_isin_datetime(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1849,7 +1850,7 @@ def test_isin_datetime(data, values): ], ) def test_isin_string(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1878,7 +1879,7 @@ def test_isin_string(data, values): ], ) def test_isin_categorical(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -2099,7 +2100,7 @@ def test_series_to_dict(into): ], ) def test_series_hasnans(data): - gs = cudf.Series(data, nan_as_null=False) + gs = _create_cudf_series_float64_default(data, nan_as_null=False) ps = gs.to_pandas(nullable=True) assert_eq(gs.hasnans, ps.hasnans) @@ -2170,8 +2171,8 @@ def test_series_init_dict_with_index(data, index): "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] ) def test_series_init_scalar_with_index(data, index): - 
pandas_series = _create_pandas_series(data, index=index) - cudf_series = cudf.Series(data, index=index) + pandas_series = _create_pandas_series_float64_default(data, index=index) + cudf_series = _create_cudf_series_float64_default(data, index=index) assert_eq( pandas_series, @@ -2313,7 +2314,15 @@ def test_series_round_builtin(data, digits): assert_eq(expected, actual) +def test_series_empty_warning(): + with pytest.warns(FutureWarning): + expected = pd.Series([]) + with pytest.warns(FutureWarning): + actual = cudf.Series([]) + assert_eq(expected, actual) + + def test_series_count_invalid_param(): - s = cudf.Series([]) + s = cudf.Series([], dtype="float64") with pytest.raises(TypeError): s.count(skipna=True) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 463cdb8a7f4..3ac605a1a4d 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -10,7 +10,8 @@ import cudf from cudf.datasets import randomdata from cudf.testing._utils import ( - _create_pandas_series, + _create_cudf_series_float64_default, + _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -222,8 +223,8 @@ def test_approx_quantiles_int(): ) def test_misc_quantiles(data, q): - pdf_series = _create_pandas_series(data) - gdf_series = cudf.Series(data) + pdf_series = _create_pandas_series_float64_default(data) + gdf_series = _create_cudf_series_float64_default(data) expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q) actual = gdf_series.quantile(q) @@ -242,7 +243,7 @@ def test_misc_quantiles(data, q): [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False ), cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -292,7 +293,7 @@ def test_kurt_skew_error(op): [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False ), cudf.Series([1.1032, 2.32, 
43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -348,7 +349,7 @@ def test_series_median(dtype, num_na): np.zeros(100), np.array([1.123, 2.343, np.nan, 0.0]), np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -376,7 +377,7 @@ def test_series_pct_change(data, periods, fill_method): np.array([1.123, 2.343, np.nan, 0.0]), cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), cudf.Series([1.1, 2.32, 43.4], index=[0, 4, 3]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -420,7 +421,7 @@ def test_cov1d(data1, data2): np.array([1.123, 2.343, np.nan, 0.0]), cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), cudf.Series([1.1032, 2.32, 43.4], index=[0, 4, 3]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -524,14 +525,14 @@ def test_df_corr(method): ) @pytest.mark.parametrize("skipna", [True, False]) def test_nans_stats(data, ops, skipna): - psr = _create_pandas_series(data) - gsr = cudf.Series(data, nan_as_null=False) + psr = _create_pandas_series_float64_default(data) + gsr = _create_cudf_series_float64_default(data, nan_as_null=False) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - gsr = cudf.Series(data, nan_as_null=False) + gsr = _create_cudf_series_float64_default(data, nan_as_null=False) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. 
So only # testing for `skipna=True` when `nan_as_null=False` From eb6d134d169ed077000ee7d075d5363dec066578 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 20 Sep 2023 06:49:14 -1000 Subject: [PATCH 19/23] Don't sort columns for DataFrame init from list of Series (#14136) closes #14132 This PR removes the re-sorting of dataframe columns when initialized by a series list. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14136 --- python/cudf/cudf/core/dataframe.py | 4 +--- python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6e664468644..1a780cc9e9f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7885,9 +7885,7 @@ def _get_union_of_indices(indexes): return indexes[0] else: merged_index = cudf.core.index.GenericIndex._concat(indexes) - merged_index = merged_index.drop_duplicates() - inds = merged_index._values.argsort() - return merged_index.take(inds) + return merged_index.drop_duplicates() def _get_union_of_series_names(series_list): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index bc85987c612..6180162ecdd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -221,6 +221,18 @@ def test_init_unaligned_with_index(): assert_eq(pdf, gdf, check_dtype=False) +def test_init_series_list_columns_unsort(): + pseries = [ + pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) + ] + gseries = [ + cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) + ] + pdf = pd.DataFrame(pseries) + gdf = cudf.DataFrame(gseries) + assert_eq(pdf, gdf) + + def 
test_series_basic(): # Make series from buffer a1 = np.arange(10, dtype=np.float64) From 40d4cc5565f600864c3b16f30d3d26fd4904deaf Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 20 Sep 2023 11:03:44 -0700 Subject: [PATCH 20/23] Refactor parquet thrift reader (#14097) Refactors the current `CompactProtocolReader` used to parse parquet file metadata. The main goal of the refactor is to allow easier use of `std::optional` fields in the thrift structs to prevent situations as in #14024 where an optional field is an empty string. The writer cannot distinguish between present-but-empty and not-present, so chooses the latter when writing the field. This PR adds a `ParquetFieldOptional` functor that can wrap the other field functors, obviating the need to write a new optional functor for each type. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14097 --- .../io/parquet/compact_protocol_reader.cpp | 691 +++++++++++++++--- .../io/parquet/compact_protocol_reader.hpp | 586 +-------------- .../io/parquet/compact_protocol_writer.cpp | 30 +- .../io/parquet/compact_protocol_writer.hpp | 3 + cpp/src/io/parquet/parquet.hpp | 18 +- cpp/src/io/parquet/parquet_common.hpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 38 +- 7 files changed, 662 insertions(+), 706 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index ae11af92f78..5c7b8ca3f8c 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -18,27 +18,474 @@ #include #include +#include #include namespace cudf { namespace io { namespace parquet { -uint8_t const CompactProtocolReader::g_list2struct[16] = {0, - 1, - 2, - ST_FLD_BYTE, - ST_FLD_DOUBLE, - 5, - ST_FLD_I16, - 7, - ST_FLD_I32, - 9, - ST_FLD_I64, - ST_FLD_BINARY, - ST_FLD_STRUCT, - ST_FLD_MAP, 
- ST_FLD_SET, - ST_FLD_LIST}; + +/** + * @brief Base class for parquet field functors. + * + * Holds the field value used by all of the specialized functors. + */ +class parquet_field { + private: + int _field_val; + + protected: + parquet_field(int f) : _field_val(f) {} + + public: + virtual ~parquet_field() = default; + int field() const { return _field_val; } +}; + +/** + * @brief Abstract base class for list functors. + */ +template +class parquet_field_list : public parquet_field { + private: + using read_func_type = std::function; + FieldType _expected_type; + read_func_type _read_value; + + protected: + std::vector& val; + + void bind_read_func(read_func_type fn) { _read_value = fn; } + + parquet_field_list(int f, std::vector& v, FieldType t) + : parquet_field(f), _expected_type(t), val(v) + { + } + + public: + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_LIST) { return true; } + auto const [t, n] = cpr->get_listh(); + if (t != _expected_type) { return true; } + val.resize(n); + for (uint32_t i = 0; i < n; i++) { + if (_read_value(i, cpr)) { return true; } + } + return false; + } +}; + +/** + * @brief Functor to set value to bool read from CompactProtocolReader + * + * bool doesn't actually encode a value, we just use the field type to indicate true/false + * + * @return True if field type is not bool + */ +class parquet_field_bool : public parquet_field { + bool& val; + + public: + parquet_field_bool(int f, bool& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) { return true; } + val = field_type == ST_FLD_TRUE; + return false; + } +}; + +/** + * @brief Functor to read a vector of booleans from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * bool fails + */ +struct parquet_field_bool_list : public parquet_field_list { + 
parquet_field_bool_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_TRUE) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + auto const current_byte = cpr->getb(); + if (current_byte != ST_FLD_TRUE && current_byte != ST_FLD_FALSE) { return true; } + this->val[i] = current_byte == ST_FLD_TRUE; + return false; + }; + bind_read_func(read_value); + } +}; + +/** + * @brief Base type for a functor that reads an integer from CompactProtocolReader + * + * Assuming signed ints since the parquet spec does not use unsigned ints anywhere. + * + * @return True if there is a type mismatch + */ +template +class parquet_field_int : public parquet_field { + static constexpr bool is_byte = std::is_same_v; + + T& val; + + public: + parquet_field_int(int f, T& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if constexpr (is_byte) { + val = cpr->getb(); + } else { + val = cpr->get_zigzag(); + } + return (field_type != EXPECTED_TYPE); + } +}; + +using parquet_field_int8 = parquet_field_int; +using parquet_field_int32 = parquet_field_int; +using parquet_field_int64 = parquet_field_int; + +/** + * @brief Functor to read a vector of integers from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading an + * integer fails + */ +template +struct parquet_field_int_list : public parquet_field_list { + parquet_field_int_list(int f, std::vector& v) : parquet_field_list(f, v, EXPECTED_TYPE) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + this->val[i] = cpr->get_zigzag(); + return false; + }; + this->bind_read_func(read_value); + } +}; + +using parquet_field_int64_list = parquet_field_int_list; + +/** + * @brief Functor to read a string from CompactProtocolReader + * + * @return True if field type mismatches or if size of string exceeds bounds + * of the CompactProtocolReader + */ +class parquet_field_string : public 
parquet_field { + std::string& val; + + public: + parquet_field_string(int f, std::string& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_BINARY) { return true; } + auto const n = cpr->get_u32(); + if (n < static_cast(cpr->m_end - cpr->m_cur)) { + val.assign(reinterpret_cast(cpr->m_cur), n); + cpr->m_cur += n; + return false; + } else { + return true; + } + } +}; + +/** + * @brief Functor to read a vector of strings from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * string fails + */ +struct parquet_field_string_list : public parquet_field_list { + parquet_field_string_list(int f, std::vector& v) + : parquet_field_list(f, v, ST_FLD_BINARY) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + auto const l = cpr->get_u32(); + if (l < static_cast(cpr->m_end - cpr->m_cur)) { + this->val[i].assign(reinterpret_cast(cpr->m_cur), l); + cpr->m_cur += l; + } else { + return true; + } + return false; + }; + bind_read_func(read_value); + } +}; + +/** + * @brief Functor to set value to enum read from CompactProtocolReader + * + * @return True if field type is not int32 + */ +template +class parquet_field_enum : public parquet_field { + Enum& val; + + public: + parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + val = static_cast(cpr->get_i32()); + return (field_type != ST_FLD_I32); + } +}; + +/** + * @brief Functor to read a vector of enums from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading an + * enum fails + */ +template +struct parquet_field_enum_list : public parquet_field_list { + parquet_field_enum_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_I32) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + this->val[i] = 
static_cast(cpr->get_i32()); + return false; + }; + this->bind_read_func(read_value); + } +}; + +/** + * @brief Functor to read a structure from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * struct fails + */ +template +class parquet_field_struct : public parquet_field { + T& val; + + public: + parquet_field_struct(int f, T& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + return (field_type != ST_FLD_STRUCT || !(cpr->read(&val))); + } +}; + +/** + * @brief Functor to read optional structures in unions + * + * @return True if field types mismatch + */ +template +class parquet_field_union_struct : public parquet_field { + E& enum_val; + thrust::optional& val; // union structs are always wrapped in std::optional + + public: + parquet_field_union_struct(int f, E& ev, thrust::optional& v) + : parquet_field(f), enum_val(ev), val(v) + { + } + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + T v; + bool const res = parquet_field_struct(field(), v).operator()(cpr, field_type); + if (!res) { + val = v; + enum_val = static_cast(field()); + } + return res; + } +}; + +/** + * @brief Functor to read empty structures in unions + * + * Added to avoid having to define read() functions for empty structs contained in unions. 
+ * + * @return True if field types mismatch + */ +template +class parquet_field_union_enumerator : public parquet_field { + E& val; + + public: + parquet_field_union_enumerator(int f, E& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { return true; } + cpr->skip_struct_field(field_type); + val = static_cast(field()); + return false; + } +}; + +/** + * @brief Functor to read a vector of structures from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * struct fails + */ +template +struct parquet_field_struct_list : public parquet_field_list { + parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_STRUCT) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + if (not cpr->read(&this->val[i])) { return true; } + return false; + }; + this->bind_read_func(read_value); + } +}; + +// TODO(ets): replace current union handling (which mirrors thrift) to use std::optional fields +// in a struct +/** + * @brief Functor to read a union member from CompactProtocolReader + * + * @tparam is_empty True if tparam `T` type is empty type, else false. 
+ * + * @return True if field types mismatch or if the process of reading a + * union member fails + */ +template +class ParquetFieldUnionFunctor : public parquet_field { + bool& is_set; + T& val; + + public: + ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { + return true; + } else { + is_set = true; + return !cpr->read(&val); + } + } +}; + +template +class ParquetFieldUnionFunctor : public parquet_field { + bool& is_set; + T& val; + + public: + ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { + return true; + } else { + is_set = true; + cpr->skip_struct_field(field_type); + return false; + } + } +}; + +template +ParquetFieldUnionFunctor> ParquetFieldUnion(int f, bool& b, T& v) +{ + return ParquetFieldUnionFunctor>(f, b, v); +} + +/** + * @brief Functor to read a binary from CompactProtocolReader + * + * @return True if field type mismatches or if size of binary exceeds bounds + * of the CompactProtocolReader + */ +class parquet_field_binary : public parquet_field { + std::vector& val; + + public: + parquet_field_binary(int f, std::vector& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_BINARY) { return true; } + auto const n = cpr->get_u32(); + if (n <= static_cast(cpr->m_end - cpr->m_cur)) { + val.resize(n); + val.assign(cpr->m_cur, cpr->m_cur + n); + cpr->m_cur += n; + return false; + } else { + return true; + } + } +}; + +/** + * @brief Functor to read a vector of binaries from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * binary fails + */ +struct parquet_field_binary_list : public parquet_field_list> { + 
parquet_field_binary_list(int f, std::vector>& v) + : parquet_field_list(f, v, ST_FLD_BINARY) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + auto const l = cpr->get_u32(); + if (l <= static_cast(cpr->m_end - cpr->m_cur)) { + val[i].resize(l); + val[i].assign(cpr->m_cur, cpr->m_cur + l); + cpr->m_cur += l; + } else { + return true; + } + return false; + }; + bind_read_func(read_value); + } +}; + +/** + * @brief Functor to read a struct from CompactProtocolReader + * + * @return True if field type mismatches + */ +class parquet_field_struct_blob : public parquet_field { + std::vector& val; + + public: + parquet_field_struct_blob(int f, std::vector& v) : parquet_field(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { return true; } + uint8_t const* const start = cpr->m_cur; + cpr->skip_struct_field(field_type); + if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } + return false; + } +}; + +/** + * @brief functor to wrap functors for optional fields + */ +template +class parquet_field_optional : public parquet_field { + thrust::optional& val; + + public: + parquet_field_optional(int f, thrust::optional& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + T v; + bool const res = FieldFunctor(field(), v).operator()(cpr, field_type); + if (!res) { val = v; } + return res; + } +}; /** * @brief Skips the number of bytes according to the specified struct type @@ -59,22 +506,21 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) case ST_FLD_BYTE: skip_bytes(1); break; case ST_FLD_DOUBLE: skip_bytes(8); break; case ST_FLD_BINARY: skip_bytes(get_u32()); break; - case ST_FLD_LIST: + case ST_FLD_LIST: [[fallthrough]]; case ST_FLD_SET: { - int c = getb(); - int n = c >> 4; - if (n == 0xf) n = get_i32(); - t = g_list2struct[c & 0xf]; - if (depth > 10) return false; - for (int32_t i = 0; i < 
n; i++) + auto const [t, n] = get_listh(); + if (depth > 10) { return false; } + for (uint32_t i = 0; i < n; i++) { skip_struct_field(t, depth + 1); + } } break; case ST_FLD_STRUCT: for (;;) { - int c = getb(); - t = c & 0xf; - if (!c) break; - if (depth > 10) return false; + int const c = getb(); + t = c & 0xf; + if (c == 0) { break; } // end of struct + if ((c & 0xf0) == 0) { get_i16(); } // field id is not a delta + if (depth > 10) { return false; } skip_struct_field(t, depth + 1); } break; @@ -125,11 +571,11 @@ inline bool function_builder(CompactProtocolReader* cpr, std::tuple int field = 0; while (true) { int const current_byte = cpr->getb(); - if (!current_byte) break; - int const field_delta = current_byte >> 4; - int const field_type = current_byte & 0xf; - field = field_delta ? field + field_delta : cpr->get_i16(); - bool exit_function = FunctionSwitchImpl::run(cpr, field_type, field, op); + if (!current_byte) { break; } + int const field_delta = current_byte >> 4; + int const field_type = current_byte & 0xf; + field = field_delta ? 
field + field_delta : cpr->get_i16(); + bool const exit_function = FunctionSwitchImpl::run(cpr, field_type, field, op); if (exit_function) { return false; } } return true; @@ -137,27 +583,30 @@ inline bool function_builder(CompactProtocolReader* cpr, std::tuple bool CompactProtocolReader::read(FileMetaData* f) { - auto op = std::make_tuple(ParquetFieldInt32(1, f->version), - ParquetFieldStructList(2, f->schema), - ParquetFieldInt64(3, f->num_rows), - ParquetFieldStructList(4, f->row_groups), - ParquetFieldStructList(5, f->key_value_metadata), - ParquetFieldString(6, f->created_by)); + using optional_list_column_order = + parquet_field_optional, parquet_field_struct_list>; + auto op = std::make_tuple(parquet_field_int32(1, f->version), + parquet_field_struct_list(2, f->schema), + parquet_field_int64(3, f->num_rows), + parquet_field_struct_list(4, f->row_groups), + parquet_field_struct_list(5, f->key_value_metadata), + parquet_field_string(6, f->created_by), + optional_list_column_order(7, f->column_orders)); return function_builder(this, op); } bool CompactProtocolReader::read(SchemaElement* s) { - auto op = std::make_tuple(ParquetFieldEnum(1, s->type), - ParquetFieldInt32(2, s->type_length), - ParquetFieldEnum(3, s->repetition_type), - ParquetFieldString(4, s->name), - ParquetFieldInt32(5, s->num_children), - ParquetFieldEnum(6, s->converted_type), - ParquetFieldInt32(7, s->decimal_scale), - ParquetFieldInt32(8, s->decimal_precision), - ParquetFieldOptionalInt32(9, s->field_id), - ParquetFieldStruct(10, s->logical_type)); + auto op = std::make_tuple(parquet_field_enum(1, s->type), + parquet_field_int32(2, s->type_length), + parquet_field_enum(3, s->repetition_type), + parquet_field_string(4, s->name), + parquet_field_int32(5, s->num_children), + parquet_field_enum(6, s->converted_type), + parquet_field_int32(7, s->decimal_scale), + parquet_field_int32(8, s->decimal_precision), + parquet_field_optional(9, s->field_id), + parquet_field_struct(10, s->logical_type)); 
return function_builder(this, op); } @@ -181,21 +630,21 @@ bool CompactProtocolReader::read(LogicalType* l) bool CompactProtocolReader::read(DecimalType* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->scale), ParquetFieldInt32(2, d->precision)); + auto op = std::make_tuple(parquet_field_int32(1, d->scale), parquet_field_int32(2, d->precision)); return function_builder(this, op); } bool CompactProtocolReader::read(TimeType* t) { auto op = - std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit)); + std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit)); return function_builder(this, op); } bool CompactProtocolReader::read(TimestampType* t) { auto op = - std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit)); + std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit)); return function_builder(this, op); } @@ -209,123 +658,129 @@ bool CompactProtocolReader::read(TimeUnit* u) bool CompactProtocolReader::read(IntType* i) { - auto op = std::make_tuple(ParquetFieldInt8(1, i->bitWidth), ParquetFieldBool(2, i->isSigned)); + auto op = std::make_tuple(parquet_field_int8(1, i->bitWidth), parquet_field_bool(2, i->isSigned)); return function_builder(this, op); } bool CompactProtocolReader::read(RowGroup* r) { - auto op = std::make_tuple(ParquetFieldStructList(1, r->columns), - ParquetFieldInt64(2, r->total_byte_size), - ParquetFieldInt64(3, r->num_rows)); + auto op = std::make_tuple(parquet_field_struct_list(1, r->columns), + parquet_field_int64(2, r->total_byte_size), + parquet_field_int64(3, r->num_rows)); return function_builder(this, op); } bool CompactProtocolReader::read(ColumnChunk* c) { - auto op = std::make_tuple(ParquetFieldString(1, c->file_path), - ParquetFieldInt64(2, c->file_offset), - ParquetFieldStruct(3, c->meta_data), - ParquetFieldInt64(4, c->offset_index_offset), - ParquetFieldInt32(5, c->offset_index_length), - 
ParquetFieldInt64(6, c->column_index_offset), - ParquetFieldInt32(7, c->column_index_length)); + auto op = std::make_tuple(parquet_field_string(1, c->file_path), + parquet_field_int64(2, c->file_offset), + parquet_field_struct(3, c->meta_data), + parquet_field_int64(4, c->offset_index_offset), + parquet_field_int32(5, c->offset_index_length), + parquet_field_int64(6, c->column_index_offset), + parquet_field_int32(7, c->column_index_length)); return function_builder(this, op); } bool CompactProtocolReader::read(ColumnChunkMetaData* c) { - auto op = std::make_tuple(ParquetFieldEnum(1, c->type), - ParquetFieldEnumList(2, c->encodings), - ParquetFieldStringList(3, c->path_in_schema), - ParquetFieldEnum(4, c->codec), - ParquetFieldInt64(5, c->num_values), - ParquetFieldInt64(6, c->total_uncompressed_size), - ParquetFieldInt64(7, c->total_compressed_size), - ParquetFieldInt64(9, c->data_page_offset), - ParquetFieldInt64(10, c->index_page_offset), - ParquetFieldInt64(11, c->dictionary_page_offset), - ParquetFieldStruct(12, c->statistics)); + auto op = std::make_tuple(parquet_field_enum(1, c->type), + parquet_field_enum_list(2, c->encodings), + parquet_field_string_list(3, c->path_in_schema), + parquet_field_enum(4, c->codec), + parquet_field_int64(5, c->num_values), + parquet_field_int64(6, c->total_uncompressed_size), + parquet_field_int64(7, c->total_compressed_size), + parquet_field_int64(9, c->data_page_offset), + parquet_field_int64(10, c->index_page_offset), + parquet_field_int64(11, c->dictionary_page_offset), + parquet_field_struct(12, c->statistics)); return function_builder(this, op); } bool CompactProtocolReader::read(PageHeader* p) { - auto op = std::make_tuple(ParquetFieldEnum(1, p->type), - ParquetFieldInt32(2, p->uncompressed_page_size), - ParquetFieldInt32(3, p->compressed_page_size), - ParquetFieldStruct(5, p->data_page_header), - ParquetFieldStruct(7, p->dictionary_page_header), - ParquetFieldStruct(8, p->data_page_header_v2)); + auto op = 
std::make_tuple(parquet_field_enum(1, p->type), + parquet_field_int32(2, p->uncompressed_page_size), + parquet_field_int32(3, p->compressed_page_size), + parquet_field_struct(5, p->data_page_header), + parquet_field_struct(7, p->dictionary_page_header), + parquet_field_struct(8, p->data_page_header_v2)); return function_builder(this, op); } bool CompactProtocolReader::read(DataPageHeader* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), - ParquetFieldEnum(2, d->encoding), - ParquetFieldEnum(3, d->definition_level_encoding), - ParquetFieldEnum(4, d->repetition_level_encoding)); + auto op = std::make_tuple(parquet_field_int32(1, d->num_values), + parquet_field_enum(2, d->encoding), + parquet_field_enum(3, d->definition_level_encoding), + parquet_field_enum(4, d->repetition_level_encoding)); return function_builder(this, op); } bool CompactProtocolReader::read(DictionaryPageHeader* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), - ParquetFieldEnum(2, d->encoding)); + auto op = std::make_tuple(parquet_field_int32(1, d->num_values), + parquet_field_enum(2, d->encoding)); return function_builder(this, op); } bool CompactProtocolReader::read(DataPageHeaderV2* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), - ParquetFieldInt32(2, d->num_nulls), - ParquetFieldInt32(3, d->num_rows), - ParquetFieldEnum(4, d->encoding), - ParquetFieldInt32(5, d->definition_levels_byte_length), - ParquetFieldInt32(6, d->repetition_levels_byte_length), - ParquetFieldBool(7, d->is_compressed)); + auto op = std::make_tuple(parquet_field_int32(1, d->num_values), + parquet_field_int32(2, d->num_nulls), + parquet_field_int32(3, d->num_rows), + parquet_field_enum(4, d->encoding), + parquet_field_int32(5, d->definition_levels_byte_length), + parquet_field_int32(6, d->repetition_levels_byte_length), + parquet_field_bool(7, d->is_compressed)); return function_builder(this, op); } bool CompactProtocolReader::read(KeyValue* k) { - auto op = 
std::make_tuple(ParquetFieldString(1, k->key), ParquetFieldString(2, k->value)); + auto op = std::make_tuple(parquet_field_string(1, k->key), parquet_field_string(2, k->value)); return function_builder(this, op); } bool CompactProtocolReader::read(PageLocation* p) { - auto op = std::make_tuple(ParquetFieldInt64(1, p->offset), - ParquetFieldInt32(2, p->compressed_page_size), - ParquetFieldInt64(3, p->first_row_index)); + auto op = std::make_tuple(parquet_field_int64(1, p->offset), + parquet_field_int32(2, p->compressed_page_size), + parquet_field_int64(3, p->first_row_index)); return function_builder(this, op); } bool CompactProtocolReader::read(OffsetIndex* o) { - auto op = std::make_tuple(ParquetFieldStructList(1, o->page_locations)); + auto op = std::make_tuple(parquet_field_struct_list(1, o->page_locations)); return function_builder(this, op); } bool CompactProtocolReader::read(ColumnIndex* c) { - auto op = std::make_tuple(ParquetFieldBoolList(1, c->null_pages), - ParquetFieldBinaryList(2, c->min_values), - ParquetFieldBinaryList(3, c->max_values), - ParquetFieldEnum(4, c->boundary_order), - ParquetFieldInt64List(5, c->null_counts)); + auto op = std::make_tuple(parquet_field_bool_list(1, c->null_pages), + parquet_field_binary_list(2, c->min_values), + parquet_field_binary_list(3, c->max_values), + parquet_field_enum(4, c->boundary_order), + parquet_field_int64_list(5, c->null_counts)); return function_builder(this, op); } bool CompactProtocolReader::read(Statistics* s) { - auto op = std::make_tuple(ParquetFieldBinary(1, s->max), - ParquetFieldBinary(2, s->min), - ParquetFieldInt64(3, s->null_count), - ParquetFieldInt64(4, s->distinct_count), - ParquetFieldBinary(5, s->max_value), - ParquetFieldBinary(6, s->min_value)); + auto op = std::make_tuple(parquet_field_binary(1, s->max), + parquet_field_binary(2, s->min), + parquet_field_int64(3, s->null_count), + parquet_field_int64(4, s->distinct_count), + parquet_field_binary(5, s->max_value), + 
parquet_field_binary(6, s->min_value)); + return function_builder(this, op); +} + +bool CompactProtocolReader::read(ColumnOrder* c) +{ + auto op = std::make_tuple(parquet_field_union_enumerator(1, c->type)); return function_builder(this, op); } @@ -338,7 +793,7 @@ bool CompactProtocolReader::read(Statistics* s) */ bool CompactProtocolReader::InitSchema(FileMetaData* md) { - if (static_cast(WalkSchema(md)) != md->schema.size()) return false; + if (static_cast(WalkSchema(md)) != md->schema.size()) { return false; } /* Inside FileMetaData, there is a std::vector of RowGroups and each RowGroup contains a * a std::vector of ColumnChunks. Each ColumnChunk has a member ColumnMetaData, which contains @@ -353,13 +808,15 @@ bool CompactProtocolReader::InitSchema(FileMetaData* md) for (auto const& path : column.meta_data.path_in_schema) { auto const it = [&] { // find_if starting at (current_schema_index + 1) and then wrapping - auto schema = [&](auto const& e) { return e.parent_idx == parent && e.name == path; }; - auto mid = md->schema.cbegin() + current_schema_index + 1; - auto it = std::find_if(mid, md->schema.cend(), schema); - if (it != md->schema.cend()) return it; + auto const schema = [&](auto const& e) { + return e.parent_idx == parent && e.name == path; + }; + auto const mid = md->schema.cbegin() + current_schema_index + 1; + auto const it = std::find_if(mid, md->schema.cend(), schema); + if (it != md->schema.cend()) { return it; } return std::find_if(md->schema.cbegin(), mid, schema); }(); - if (it == md->schema.cend()) return false; + if (it == md->schema.cend()) { return false; } current_schema_index = std::distance(md->schema.cbegin(), it); column.schema_idx = current_schema_index; parent = current_schema_index; @@ -401,9 +858,9 @@ int CompactProtocolReader::WalkSchema( if (e->num_children > 0) { for (int i = 0; i < e->num_children; i++) { e->children_idx.push_back(idx); - int idx_old = idx; - idx = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level); 
- if (idx <= idx_old) break; // Error + int const idx_old = idx; + idx = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level); + if (idx <= idx_old) { break; } // Error } } return idx; diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 62ccacaac37..619815db503 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace cudf { @@ -40,9 +41,6 @@ namespace parquet { * compression codecs are supported yet. */ class CompactProtocolReader { - protected: - static const uint8_t g_list2struct[16]; - public: explicit CompactProtocolReader(uint8_t const* base = nullptr, size_t len = 0) { init(base, len); } void init(uint8_t const* base, size_t len) @@ -57,45 +55,46 @@ class CompactProtocolReader { bytecnt = std::min(bytecnt, (size_t)(m_end - m_cur)); m_cur += bytecnt; } - uint32_t get_u32() noexcept + + // returns a varint encoded integer + template + T get_varint() noexcept { - uint32_t v = 0; + T v = 0; for (uint32_t l = 0;; l += 7) { - uint32_t c = getb(); + T c = getb(); v |= (c & 0x7f) << l; - if (c < 0x80) break; + if (c < 0x80) { break; } } return v; } - uint64_t get_u64() noexcept - { - uint64_t v = 0; - for (uint64_t l = 0;; l += 7) { - uint64_t c = getb(); - v |= (c & 0x7f) << l; - if (c < 0x80) break; - } - return v; - } - int32_t get_i16() noexcept { return get_i32(); } - int32_t get_i32() noexcept - { - uint32_t u = get_u32(); - return (int32_t)((u >> 1u) ^ -(int32_t)(u & 1)); - } - int64_t get_i64() noexcept + + // returns a zigzag encoded signed integer + template + T get_zigzag() noexcept { - uint64_t u = get_u64(); - return (int64_t)((u >> 1u) ^ -(int64_t)(u & 1)); + using U = std::make_unsigned_t; + U const u = get_varint(); + return static_cast((u >> 1u) ^ -static_cast(u & 1)); } - int32_t get_listh(uint8_t* el_type) noexcept + + // thrift spec says to use 
zigzag i32 for i16 types + int32_t get_i16() noexcept { return get_zigzag(); } + int32_t get_i32() noexcept { return get_zigzag(); } + int64_t get_i64() noexcept { return get_zigzag(); } + + uint32_t get_u32() noexcept { return get_varint(); } + uint64_t get_u64() noexcept { return get_varint(); } + + [[nodiscard]] std::pair get_listh() noexcept { - uint32_t c = getb(); - int32_t sz = c >> 4; - *el_type = c & 0xf; - if (sz == 0xf) sz = get_u32(); - return sz; + uint32_t const c = getb(); + uint32_t sz = c >> 4; + uint8_t t = c & 0xf; + if (sz == 0xf) { sz = get_u32(); } + return {t, sz}; } + bool skip_struct_field(int t, int depth = 0); public: @@ -120,6 +119,7 @@ class CompactProtocolReader { bool read(OffsetIndex* o); bool read(ColumnIndex* c); bool read(Statistics* s); + bool read(ColumnOrder* c); public: static int NumRequiredBits(uint32_t max_level) noexcept @@ -140,523 +140,11 @@ class CompactProtocolReader { uint8_t const* m_cur = nullptr; uint8_t const* m_end = nullptr; - friend class ParquetFieldBool; - friend class ParquetFieldBoolList; - friend class ParquetFieldInt8; - friend class ParquetFieldInt32; - friend class ParquetFieldOptionalInt32; - friend class ParquetFieldInt64; - friend class ParquetFieldInt64List; - template - friend class ParquetFieldStructListFunctor; - friend class ParquetFieldString; - template - friend class ParquetFieldStructFunctor; - template - friend class ParquetFieldUnionFunctor; - template - friend class ParquetFieldEnum; - template - friend class ParquetFieldEnumListFunctor; - friend class ParquetFieldStringList; - friend class ParquetFieldBinary; - friend class ParquetFieldBinaryList; - friend class ParquetFieldStructBlob; -}; - -/** - * @brief Functor to set value to bool read from CompactProtocolReader - * - * @return True if field type is not bool - */ -class ParquetFieldBool { - int field_val; - bool& val; - - public: - ParquetFieldBool(int f, bool& v) : field_val(f), val(v) {} - - inline bool 
operator()(CompactProtocolReader* cpr, int field_type) - { - return (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) || - !(val = (field_type == ST_FLD_TRUE), true); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of booleans from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * bool fails - */ -class ParquetFieldBoolList { - int field_val; - std::vector& val; - - public: - ParquetFieldBoolList(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_TRUE) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - unsigned int current_byte = cpr->getb(); - if (current_byte != ST_FLD_TRUE && current_byte != ST_FLD_FALSE) return true; - val[i] = current_byte == ST_FLD_TRUE; - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to 8 bit integer read from CompactProtocolReader - * - * @return True if field type is not int8 - */ -class ParquetFieldInt8 { - int field_val; - int8_t& val; - - public: - ParquetFieldInt8(int f, int8_t& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->getb(); - return (field_type != ST_FLD_BYTE); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to 32 bit integer read from CompactProtocolReader - * - * @return True if field type is not int32 - */ -class ParquetFieldInt32 { - int field_val; - int32_t& val; - - public: - ParquetFieldInt32(int f, int32_t& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->get_i32(); - return (field_type != ST_FLD_I32); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to optional 
32 bit integer read from CompactProtocolReader - * - * @return True if field type is not int32 - */ -class ParquetFieldOptionalInt32 { - int field_val; - std::optional& val; - - public: - ParquetFieldOptionalInt32(int f, std::optional& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->get_i32(); - return (field_type != ST_FLD_I32); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to 64 bit integer read from CompactProtocolReader - * - * @return True if field type is not int32 or int64 - */ -class ParquetFieldInt64 { - int field_val; - int64_t& val; - - public: - ParquetFieldInt64(int f, int64_t& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->get_i64(); - return (field_type < ST_FLD_I16 || field_type > ST_FLD_I64); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of 64-bit integers from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading an - * int64 fails - */ -class ParquetFieldInt64List { - int field_val; - std::vector& val; - - public: - ParquetFieldInt64List(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_I64) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - val[i] = cpr->get_i64(); - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of structures from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * struct fails - */ -template -class ParquetFieldStructListFunctor { - int field_val; - std::vector& val; - - public: - ParquetFieldStructListFunctor(int f, std::vector& v) : field_val(f), val(v) {} - - 
inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - - int current_byte = cpr->getb(); - if ((current_byte & 0xf) != ST_FLD_STRUCT) return true; - int n = current_byte >> 4; - if (n == 0xf) n = cpr->get_u32(); - val.resize(n); - for (int32_t i = 0; i < n; i++) { - if (!(cpr->read(&val[i]))) { return true; } - } - - return false; - } - - int field() { return field_val; } -}; - -template -ParquetFieldStructListFunctor ParquetFieldStructList(int f, std::vector& v) -{ - return ParquetFieldStructListFunctor(f, v); -} - -/** - * @brief Functor to read a string from CompactProtocolReader - * - * @return True if field type mismatches or if size of string exceeds bounds - * of the CompactProtocolReader - */ -class ParquetFieldString { - int field_val; - std::string& val; - - public: - ParquetFieldString(int f, std::string& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_BINARY) return true; - uint32_t n = cpr->get_u32(); - if (n < (size_t)(cpr->m_end - cpr->m_cur)) { - val.assign((char const*)cpr->m_cur, n); - cpr->m_cur += n; - return false; - } else { - return true; - } - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a structure from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * struct fails - */ -template -class ParquetFieldStructFunctor { - int field_val; - T& val; - - public: - ParquetFieldStructFunctor(int f, T& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - return (field_type != ST_FLD_STRUCT || !(cpr->read(&val))); - } - - int field() { return field_val; } -}; - -template -ParquetFieldStructFunctor ParquetFieldStruct(int f, T& v) -{ - return ParquetFieldStructFunctor(f, v); -} - -/** - * @brief Functor to read a union member from CompactProtocolReader - * - * @tparam is_empty 
True if tparam `T` type is empty type, else false. - * - * @return True if field types mismatch or if the process of reading a - * union member fails - */ -template -class ParquetFieldUnionFunctor { - int field_val; - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - return !cpr->read(&val); - } - } - - int field() { return field_val; } -}; - -template -struct ParquetFieldUnionFunctor { - int field_val; - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - cpr->skip_struct_field(field_type); - return false; - } - } - - int field() { return field_val; } -}; - -template -ParquetFieldUnionFunctor> ParquetFieldUnion(int f, bool& b, T& v) -{ - return ParquetFieldUnionFunctor>(f, b, v); -} - -/** - * @brief Functor to set value to enum read from CompactProtocolReader - * - * @return True if field type is not int32 - */ -template -class ParquetFieldEnum { - int field_val; - Enum& val; - - public: - ParquetFieldEnum(int f, Enum& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = static_cast(cpr->get_i32()); - return (field_type != ST_FLD_I32); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of enums from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading an - * enum fails - */ -template -class ParquetFieldEnumListFunctor { - int field_val; - std::vector& val; - - public: - ParquetFieldEnumListFunctor(int f, std::vector& v) : field_val(f), val(v) {} - inline bool 
operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - int current_byte = cpr->getb(); - if ((current_byte & 0xf) != ST_FLD_I32) return true; - int n = current_byte >> 4; - if (n == 0xf) n = cpr->get_u32(); - val.resize(n); - for (int32_t i = 0; i < n; i++) { - val[i] = static_cast(cpr->get_i32()); - } - return false; - } - - int field() { return field_val; } -}; - -template -ParquetFieldEnumListFunctor ParquetFieldEnumList(int field, std::vector& v) -{ - return ParquetFieldEnumListFunctor(field, v); -} - -/** - * @brief Functor to read a vector of strings from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * string fails - */ -class ParquetFieldStringList { - int field_val; - std::vector& val; - - public: - ParquetFieldStringList(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_BINARY) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - uint32_t l = cpr->get_u32(); - if (l < (size_t)(cpr->m_end - cpr->m_cur)) { - val[i].assign((char const*)cpr->m_cur, l); - cpr->m_cur += l; - } else - return true; - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a binary from CompactProtocolReader - * - * @return True if field type mismatches or if size of binary exceeds bounds - * of the CompactProtocolReader - */ -class ParquetFieldBinary { - int field_val; - std::vector& val; - - public: - ParquetFieldBinary(int f, std::vector& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_BINARY) return true; - uint32_t n = cpr->get_u32(); - if (n <= (size_t)(cpr->m_end - cpr->m_cur)) { - val.resize(n); - val.assign(cpr->m_cur, cpr->m_cur + n); - cpr->m_cur += 
n; - return false; - } else { - return true; - } - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of binaries from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * binary fails - */ -class ParquetFieldBinaryList { - int field_val; - std::vector>& val; - - public: - ParquetFieldBinaryList(int f, std::vector>& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_BINARY) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - uint32_t l = cpr->get_u32(); - if (l <= (size_t)(cpr->m_end - cpr->m_cur)) { - val[i].resize(l); - val[i].assign(cpr->m_cur, cpr->m_cur + l); - cpr->m_cur += l; - } else - return true; - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a struct from CompactProtocolReader - * - * @return True if field type mismatches - */ -class ParquetFieldStructBlob { - int field_val; - std::vector& val; - - public: - ParquetFieldStructBlob(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) return true; - uint8_t const* start = cpr->m_cur; - cpr->skip_struct_field(field_type); - if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } - return false; - } - - int field() { return field_val; } + friend class parquet_field_string; + friend class parquet_field_string_list; + friend class parquet_field_binary; + friend class parquet_field_binary_list; + friend class parquet_field_struct_blob; }; } // namespace parquet diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index b2c0c97c52d..60bc8984d81 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ 
b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -33,18 +33,7 @@ size_t CompactProtocolWriter::write(FileMetaData const& f) c.field_struct_list(4, f.row_groups); if (not f.key_value_metadata.empty()) { c.field_struct_list(5, f.key_value_metadata); } if (not f.created_by.empty()) { c.field_string(6, f.created_by); } - if (f.column_order_listsize != 0) { - // Dummy list of struct containing an empty field1 struct - c.put_field_header(7, c.current_field(), ST_FLD_LIST); - c.put_byte((uint8_t)((std::min(f.column_order_listsize, 0xfu) << 4) | ST_FLD_STRUCT)); - if (f.column_order_listsize >= 0xf) c.put_uint(f.column_order_listsize); - for (uint32_t i = 0; i < f.column_order_listsize; i++) { - c.put_field_header(1, 0, ST_FLD_STRUCT); - c.put_byte(0); // ColumnOrder.field1 struct end - c.put_byte(0); // ColumnOrder struct end - } - c.set_current_field(7); - } + if (f.column_orders.has_value()) { c.field_struct_list(7, f.column_orders.value()); } return c.value(); } @@ -233,6 +222,16 @@ size_t CompactProtocolWriter::write(OffsetIndex const& s) return c.value(); } +size_t CompactProtocolWriter::write(ColumnOrder const& co) +{ + CompactProtocolFieldWriter c(*this); + switch (co) { + case ColumnOrder::TYPE_ORDER: c.field_empty_struct(1); break; + default: break; + } + return c.value(); +} + void CompactProtocolFieldWriter::put_byte(uint8_t v) { writer.m_buf.push_back(v); } void CompactProtocolFieldWriter::put_byte(uint8_t const* raw, uint32_t len) @@ -320,6 +319,13 @@ inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) current_field_value = field; } +inline void CompactProtocolFieldWriter::field_empty_struct(int field) +{ + put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_byte(0); // add a stop field + current_field_value = field; +} + template inline void CompactProtocolFieldWriter::field_struct_list(int field, std::vector const& val) { diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp 
b/cpp/src/io/parquet/compact_protocol_writer.hpp index 8d7b0961934..26d66527aa5 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -53,6 +53,7 @@ class CompactProtocolWriter { size_t write(Statistics const&); size_t write(PageLocation const&); size_t write(OffsetIndex const&); + size_t write(ColumnOrder const&); protected: std::vector& m_buf; @@ -94,6 +95,8 @@ class CompactProtocolFieldWriter { template inline void field_struct(int field, T const& val); + inline void field_empty_struct(int field); + template inline void field_struct_list(int field, std::vector const& val); diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index f7318bb9935..c2affc774c2 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -18,6 +18,8 @@ #include "parquet_common.hpp" +#include + #include #include #include @@ -118,6 +120,16 @@ struct LogicalType { BsonType BSON; }; +/** + * Union to specify the order used for the min_value and max_value fields for a column. 
+ */ +struct ColumnOrder { + enum Type { UNDEFINED, TYPE_ORDER }; + Type type; + + operator Type() const { return type; } +}; + /** * @brief Struct for describing an element/field in the Parquet format schema * @@ -135,7 +147,7 @@ struct SchemaElement { int32_t num_children = 0; int32_t decimal_scale = 0; int32_t decimal_precision = 0; - std::optional field_id = std::nullopt; + thrust::optional field_id = thrust::nullopt; bool output_as_byte_array = false; // The following fields are filled in later during schema initialization @@ -284,8 +296,8 @@ struct FileMetaData { int64_t num_rows = 0; std::vector row_groups; std::vector key_value_metadata; - std::string created_by = ""; - uint32_t column_order_listsize = 0; + std::string created_by = ""; + thrust::optional> column_orders; }; /** diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5f8f1617cb9..5a1716bb547 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -141,7 +141,7 @@ enum BoundaryOrder { /** * @brief Thrift compact protocol struct field types */ -enum { +enum FieldType { ST_FLD_TRUE = 1, ST_FLD_FALSE = 2, ST_FLD_BYTE = 3, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index d2976a3f5d9..a124f352ee4 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -74,8 +74,11 @@ struct aggregate_writer_metadata { for (size_t i = 0; i < partitions.size(); ++i) { this->files[i].num_rows = partitions[i].num_rows; } - this->column_order_listsize = - (stats_granularity != statistics_freq::STATISTICS_NONE) ? 
num_columns : 0; + + if (stats_granularity != statistics_freq::STATISTICS_NONE) { + ColumnOrder default_order = {ColumnOrder::TYPE_ORDER}; + this->column_orders = std::vector(num_columns, default_order); + } for (size_t p = 0; p < kv_md.size(); ++p) { std::transform(kv_md[p].begin(), @@ -102,13 +105,13 @@ struct aggregate_writer_metadata { { CUDF_EXPECTS(part < files.size(), "Invalid part index queried"); FileMetaData meta{}; - meta.version = this->version; - meta.schema = this->schema; - meta.num_rows = this->files[part].num_rows; - meta.row_groups = this->files[part].row_groups; - meta.key_value_metadata = this->files[part].key_value_metadata; - meta.created_by = this->created_by; - meta.column_order_listsize = this->column_order_listsize; + meta.version = this->version; + meta.schema = this->schema; + meta.num_rows = this->files[part].num_rows; + meta.row_groups = this->files[part].row_groups; + meta.key_value_metadata = this->files[part].key_value_metadata; + meta.created_by = this->created_by; + meta.column_orders = this->column_orders; return meta; } @@ -170,8 +173,8 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - std::string created_by = ""; - uint32_t column_order_listsize = 0; + std::string created_by = ""; + thrust::optional> column_orders = thrust::nullopt; }; namespace { @@ -2373,20 +2376,7 @@ std::unique_ptr> writer::merge_row_group_metadata( md.num_rows += tmp.num_rows; } } - // Reader doesn't currently populate column_order, so infer it here - if (not md.row_groups.empty()) { - auto const is_valid_stats = [](auto const& stats) { - return not stats.max.empty() || not stats.min.empty() || stats.null_count != -1 || - stats.distinct_count != -1 || not stats.max_value.empty() || - not stats.min_value.empty(); - }; - uint32_t num_columns = static_cast(md.row_groups[0].columns.size()); - md.column_order_listsize = - (num_columns > 0 && is_valid_stats(md.row_groups[0].columns[0].meta_data.statistics)) - ? 
num_columns - : 0; - } // Thrift-encode the resulting output file_header_s fhdr; file_ender_s fendr; From e87d2fc1df6105d802b300bad19a9937f8155613 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 20 Sep 2023 21:18:31 +0100 Subject: [PATCH 21/23] Reduce memory usage of as_categorical_column (#14138) The main culprit is in the way the codes returned from _label_encoding were being ordered. We were generating an int64 column for the order, gathering through the left gather map, and then argsorting, before using that ordering as a gather map for the codes. We note that gather(y, with=argsort(x)) is equivalent to sort_by_key(y, with=x) so use that instead (avoiding an unnecessary gather). Furthermore we also note that gather([0..n), with=x) is just equivalent to x, so we can avoid a gather too. This reduces the peak memory footprint of categorifying a random column of 500_000_000 int32 values where there are 100 unique values from 24.75 GiB to 11.67 GiB. ### Test code ```python import cudf import cupy as cp K = 100 N = 500_000_000 rng = cp.random._generator.RandomState() column = cudf.core.column.as_column(rng.choice(cp.arange(K, dtype="int32"), size=(N,), replace=True)) column = column.astype("category", ordered=False) ``` ### Before ![Screenshot from 2023-09-20 14-49-27](https://github.com/rapidsai/cudf/assets/1126981/08782501-c233-4efd-b4d6-a378cea82a82) ### After ![Screenshot from 2023-09-20 14-49-42](https://github.com/rapidsai/cudf/assets/1126981/93193bfb-a93e-45bf-8e5a-24289efc77c4) Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/14138 --- python/cudf/cudf/core/column/column.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 
d2e2f11a12e..0bc50a521e2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1390,20 +1390,19 @@ def _return_sentinel_column(): except ValueError: return _return_sentinel_column() - codes = arange(len(cats), dtype=dtype) left_gather_map, right_gather_map = cpp_join( [self], [cats], how="left" ) - codes = codes.take( - right_gather_map, nullify=True, check_bounds=False - ).fillna(na_sentinel.value) - + codes = libcudf.copying.gather( + [arange(len(cats), dtype=dtype)], right_gather_map, nullify=True + ) + del right_gather_map # reorder `codes` so that its values correspond to the # values of `self`: - order = arange(len(self)) - order = order.take(left_gather_map, check_bounds=False).argsort() - codes = codes.take(order) - return codes + (codes,) = libcudf.sort.sort_by_key( + codes, [left_gather_map], [True], ["last"], stable=True + ) + return codes.fillna(na_sentinel.value) def column_empty_like( From fe99e4baa3a7cd0f87658bf1ea77b17ec61fd7dc Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:42:32 -0400 Subject: [PATCH 22/23] Expose stream parameter in public strings find APIs (#14060) Add stream parameter to public APIs: - `cudf::strings::find()` - `cudf::strings::rfind()` - `cudf::strings::contains()` - `cudf::strings::starts_with()` - `cudf::strings::ends_with()` - `cudf::strings::findall()` - `cudf::strings::find_multiple()` Also cleaned up some of the doxygen comments. 
Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14060 --- cpp/include/cudf/strings/find.hpp | 102 ++++++++++++--------- cpp/include/cudf/strings/find_multiple.hpp | 12 ++- cpp/include/cudf/strings/findall.hpp | 2 + cpp/src/strings/search/find.cu | 24 +++-- cpp/src/strings/search/find_multiple.cu | 7 +- cpp/src/strings/search/findall.cu | 3 +- cpp/tests/CMakeLists.txt | 5 +- cpp/tests/streams/strings/find_test.cpp | 49 ++++++++++ 8 files changed, 143 insertions(+), 61 deletions(-) create mode 100644 cpp/tests/streams/strings/find_test.cpp diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index 2fed36862b9..c1aa8b294b3 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -43,19 +43,21 @@ namespace strings { * * @throw cudf::logic_error if start position is greater than stop position. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param start First character position to include in the search. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param start First character position to include in the search * @param stop Last position (exclusive) to include in the search. * Default of -1 will search to the end of the string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values */ std::unique_ptr find( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -72,19 +74,21 @@ std::unique_ptr find( * * @throw cudf::logic_error if start position is greater than stop position. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param start First position to include in the search. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param start First position to include in the search * @param stop Last position (exclusive) to include in the search. * Default of -1 will search starting at the end of the string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values */ std::unique_ptr rfind( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -123,37 +127,41 @@ std::unique_ptr find( * * Any null string entries return corresponding null entries in the output columns. 
* - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr contains( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of boolean values for each string where true indicates * the corresponding target string was found within that string in the provided column. * - * The 'output[i] = true` if string `targets[i]` is found inside `strings[i]` otherwise + * The 'output[i] = true` if string `targets[i]` is found inside `input[i]` otherwise * `output[i] = false`. * If `target[i]` is an empty string, true is returned for `output[i]`. * If `target[i]` is null, false is returned for `output[i]`. * - * Any null `strings[i]` row results in a null `output[i]` row. + * Any null string entries return corresponding null entries in the output columns. * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings column of targets to check row-wise in `strings`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. 
+ * @param input Strings instance for this operation + * @param targets Strings column of targets to check row-wise in `strings` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr contains( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -166,14 +174,16 @@ std::unique_ptr contains( * * Any null string entries return corresponding null entries in the output columns. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * @return New type_id::BOOL8 column. */ std::unique_ptr starts_with( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -190,14 +200,16 @@ std::unique_ptr starts_with( * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. 
+ * @param input Strings instance for this operation + * @param targets Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr starts_with( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -210,14 +222,16 @@ std::unique_ptr starts_with( * * Any null string entries return corresponding null entries in the output columns. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr ends_with( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -234,14 +248,16 @@ std::unique_ptr ends_with( * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. 
+ * @param input Strings instance for this operation + * @param targets Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr ends_with( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 21cfdb15146..06b851c5012 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,14 +48,16 @@ namespace strings { * * @throw cudf::logic_error if `targets` is empty or contains nulls * - * @param input Strings instance for this operation. - * @param targets Strings to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Lists column with character position values. 
+ * @param input Strings instance for this operation + * @param targets Strings to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Lists column with character position values */ std::unique_ptr find_multiple( strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 745f0fc19ff..379b9624dc6 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -57,12 +57,14 @@ struct regex_program; * * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New lists column of strings */ std::unique_ptr findall( strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 3de9dd34d83..1299e552565 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -305,20 +305,22 @@ std::unique_ptr find(strings_column_view const& strings, string_scalar const& target, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find(strings, target, start, stop, cudf::get_default_stream(), mr); + return detail::find(strings, target, start, stop, stream, 
mr); } std::unique_ptr rfind(strings_column_view const& strings, string_scalar const& target, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rfind(strings, target, start, stop, cudf::get_default_stream(), mr); + return detail::rfind(strings, target, start, stop, stream, mr); } std::unique_ptr find(strings_column_view const& input, @@ -618,50 +620,56 @@ std::unique_ptr ends_with(strings_column_view const& strings, std::unique_ptr contains(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, target, cudf::get_default_stream(), mr); + return detail::contains(strings, target, stream, mr); } std::unique_ptr contains(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, targets, cudf::get_default_stream(), mr); + return detail::contains(strings, targets, stream, mr); } std::unique_ptr starts_with(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, target, cudf::get_default_stream(), mr); + return detail::starts_with(strings, target, stream, mr); } std::unique_ptr starts_with(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, targets, cudf::get_default_stream(), mr); + return detail::starts_with(strings, targets, stream, mr); } std::unique_ptr ends_with(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return 
detail::ends_with(strings, target, cudf::get_default_stream(), mr); + return detail::ends_with(strings, target, stream, mr); } std::unique_ptr ends_with(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ends_with(strings, targets, cudf::get_default_stream(), mr); + return detail::ends_with(strings, targets, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index 4a823ad1dcb..fcaec835f4d 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -70,8 +70,8 @@ std::unique_ptr find_multiple(strings_column_view const& input, results->set_null_count(0); auto offsets = cudf::detail::sequence(strings_count + 1, - numeric_scalar(0), - numeric_scalar(targets_count), + numeric_scalar(0, true, stream), + numeric_scalar(targets_count, true, stream), stream, mr); return make_lists_column(strings_count, @@ -88,10 +88,11 @@ std::unique_ptr find_multiple(strings_column_view const& input, // external API std::unique_ptr find_multiple(strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find_multiple(input, targets, cudf::get_default_stream(), mr); + return detail::find_multiple(input, targets, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 2df64c6a0a7..acea4ff1c51 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -134,10 +134,11 @@ std::unique_ptr findall(strings_column_view const& input, std::unique_ptr findall(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::findall(input, 
prog, cudf::get_default_stream(), mr); + return detail::findall(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4923ef5c903..6414962903e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -627,7 +627,10 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_STRINGS_TEST streams/strings/case_test.cpp STREAM_MODE testing) +ConfigureTest( + STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE + testing +) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp new file mode 100644 index 00000000000..b734a1738cc --- /dev/null +++ b/cpp/tests/streams/strings/find_test.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class StringsFindTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsFindTest, Find) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + cudf::strings::find(view, target, 0, -1, cudf::test::get_default_stream()); + cudf::strings::rfind(view, target, 0, -1, cudf::test::get_default_stream()); + cudf::strings::find(view, view, 0, cudf::test::get_default_stream()); + cudf::strings::find_multiple(view, view, cudf::test::get_default_stream()); + cudf::strings::contains(view, target, cudf::test::get_default_stream()); + cudf::strings::starts_with(view, target, cudf::test::get_default_stream()); + cudf::strings::starts_with(view, view, cudf::test::get_default_stream()); + cudf::strings::ends_with(view, target, cudf::test::get_default_stream()); + cudf::strings::ends_with(view, view, cudf::test::get_default_stream()); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::findall(view, *prog, cudf::test::get_default_stream()); +} From 05ee2604d8f4e7c6525d12926100e2b11b6d6cb0 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:45:11 -0400 Subject: [PATCH 23/23] Fix kernel launch error for cudf::io::orc::gpu::rowgroup_char_counts_kernel (#14139) Fixes memcheck error found during the nightly builds found in gtest `OrcWriterNumericTypeTest/0.SingleColumn` ``` # compute-sanitizer --tool memcheck gtests/ORC_TEST --gtest_filter=OrcWriterNumericTypeTest/0.SingleColumn --rmm_mode=cuda ========= COMPUTE-SANITIZER Note: Google Test filter = OrcWriterNumericTypeTest/0.SingleColumn [==========] Running 1 test from 1 test suite. [----------] Global test environment set-up. 
[----------] 1 test from OrcWriterNumericTypeTest/0, where TypeParam = signed char [ RUN ] OrcWriterNumericTypeTest/0.SingleColumn ========= Program hit cudaErrorInvalidConfiguration (error 9) due to "invalid configuration argument" on CUDA API call to cudaLaunchKernel. ========= Saved host backtrace up to driver entry point at error ========= Host Frame: [0x480aa6] ========= in /usr/lib/x86_64-linux-gnu/libcuda.so.1 ========= Host Frame:cudaLaunchKernel [0x6c358] ========= in /conda/envs/rapids/lib/libcudart.so.11.0 ========= Host Frame:__device_stub__ZN4cudf2io3orc3gpu27rowgroup_char_counts_kernelENS_6detail11base_2dspanIiNS_11device_spanEEENS5_IKNS1_22orc_column_device_viewELm18446744073709551615EEENS4_IKNS1_13rowgroup_rowsES5_EENS5_IKjLm18446744073709551615EEE(cudf::detail::base_2dspan&, cudf::device_span&, cudf::detail::base_2dspan&, cudf::device_span&) [0x14fccb4] ``` Adds a check to avoid the kernel launch if the number of strings columns is zero. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14139 --- cpp/src/io/orc/dict_enc.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 0007530a5af..1d2262a1ccc 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -60,6 +60,7 @@ void rowgroup_char_counts(device_2dspan counts, auto const num_rowgroups = rowgroup_bounds.size().first; auto const num_str_cols = str_col_indexes.size(); + if (num_str_cols == 0) { return; } int block_size = 0; // suggested thread count to use int min_grid_size = 0; // minimum block count required