From 60009a8005a8b9b69c2c870465b5cf46532d3388 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 13 Sep 2023 17:12:44 -0500
Subject: [PATCH 01/23] Fix naming issues with `Index.to_frame` and
 `MultiIndex.to_frame` APIs (#14105)

This PR:

- [x] Introduces `allow_duplicates` in `MultiIndex.to_frame` for parity with pandas' `MultiIndex.to_frame`; however, this parameter is non-functional since cudf doesn't support duplicate column names.
- [x] Fixed handling of duplicate index names in `MultiIndex.to_frame`
- [x] Added proper docs for `Index.to_frame` & `MultiIndex.to_frame` separately due to change in API signature.
- [x] Added tests for `Index.to_frame` & `MultiIndex.to_frame`
- [x] Introduced deprecations that will go away when pandas-2.0 support is enabled.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14105
---
 python/cudf/cudf/core/_base_index.py      | 57 +++++++++++--
 python/cudf/cudf/core/multiindex.py       | 99 ++++++++++++++++++++---
 python/cudf/cudf/tests/test_index.py      | 19 +++++
 python/cudf/cudf/tests/test_multiindex.py | 83 +++++++++++++++++++
 4 files changed, 242 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 2f6e864b51c..c0bd9ec6eee 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -19,6 +19,7 @@
     drop_nulls,
 )
 from cudf._lib.types import size_type_dtype
+from cudf.api.extensions import no_default
 from cudf.api.types import (
     is_bool_dtype,
     is_integer,
@@ -701,21 +702,65 @@ def fillna(self, value, downcast=None):
 
         return super().fillna(value=value)
 
-    def to_frame(self, index=True, name=None):
+    def to_frame(self, index=True, name=no_default):
         """Create a DataFrame with a column containing this Index
 
         Parameters
         ----------
         index : boolean, default True
             Set the index of the returned DataFrame as the original Index
-        name : str, default None
-            Name to be used for the column
+        name : object, defaults to index.name
+            The passed name should substitute for the index name (if it has
+            one).
+
         Returns
         -------
         DataFrame
-            cudf DataFrame
-        """
-        if name is not None:
+            DataFrame containing the original Index data.
+
+        See Also
+        --------
+        Index.to_series : Convert an Index to a Series.
+        Series.to_frame : Convert Series to DataFrame.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal')
+        >>> idx.to_frame()
+               animal
+        animal
+        Ant       Ant
+        Bear     Bear
+        Cow       Cow
+
+        By default, the original Index is reused. To enforce a new Index:
+
+        >>> idx.to_frame(index=False)
+            animal
+        0   Ant
+        1  Bear
+        2   Cow
+
+        To override the name of the resulting column, specify `name`:
+
+        >>> idx.to_frame(index=False, name='zoo')
+            zoo
+        0   Ant
+        1  Bear
+        2   Cow
+        """
+        if name is None:
+            warnings.warn(
+                "Explicitly passing `name=None` currently preserves "
+                "the Index's name or uses a default name of 0. This "
+                "behaviour is deprecated, and in the future `None` "
+                "will be used as the name of the "
+                "resulting DataFrame column.",
+                FutureWarning,
+            )
+            name = no_default
+        if name is not no_default:
             col_name = name
         elif self.name is None:
             col_name = 0
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index bc6726879c1..21380bb841c 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -20,6 +20,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._typing import DataFrameOrSeries
+from cudf.api.extensions import no_default
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
 from cudf.core._compat import PANDAS_GE_150
@@ -1015,7 +1016,12 @@ def __getitem__(self, index):
         elif isinstance(index, slice):
             start, stop, step = index.indices(len(self))
             index = column.arange(start, stop, step)
-        result = MultiIndex.from_frame(self.to_frame(index=False).take(index))
+        result = MultiIndex.from_frame(
+            self.to_frame(index=False, name=range(0, self.nlevels)).take(
+                index
+            ),
+            names=self.names,
+        )
 
         # we are indexing into a single row of the MultiIndex,
         # return that row as a tuple:
@@ -1026,24 +1032,95 @@ def __getitem__(self, index):
             result._codes = self._codes.take(index)
         if self._levels is not None:
             result._levels = self._levels
-        result.names = self.names
         return result
 
     @_cudf_nvtx_annotate
-    def to_frame(self, index=True, name=None):
+    def to_frame(self, index=True, name=no_default, allow_duplicates=False):
+        """
+        Create a DataFrame with the levels of the MultiIndex as columns.
+
+        Column ordering is determined by the DataFrame constructor with data as
+        a dict.
+
+        Parameters
+        ----------
+        index : bool, default True
+            Set the index of the returned DataFrame as the original MultiIndex.
+        name : list / sequence of str, optional
+            The passed names should substitute index level names.
+        allow_duplicates : bool, optional, default False
+            Allow duplicate column labels to be created. Note
+            that this parameter is non-functional because
+            duplicate column labels aren't supported in cudf.
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> import cudf
+        >>> mi = cudf.MultiIndex.from_tuples([('a', 'c'), ('b', 'd')])
+        >>> mi
+        MultiIndex([('a', 'c'),
+                    ('b', 'd')],
+                   )
+
+        >>> df = mi.to_frame()
+        >>> df
+             0  1
+        a c  a  c
+        b d  b  d
+
+        >>> df = mi.to_frame(index=False)
+        >>> df
+           0  1
+        0  a  c
+        1  b  d
+
+        >>> df = mi.to_frame(name=['x', 'y'])
+        >>> df
+             x  y
+        a c  a  c
+        b d  b  d
+        """
         # TODO: Currently this function makes a shallow copy, which is
         # incorrect. We want to make a deep copy, otherwise further
         # modifications of the resulting DataFrame will affect the MultiIndex.
-        df = cudf.DataFrame._from_data(data=self._data)
-        if index:
-            df = df.set_index(self)
-        if name is not None:
+        if name is None:
+            warnings.warn(
+                "Explicitly passing `name=None` currently preserves the "
+                "Index's name or uses a default name of 0. This behaviour "
+                "is deprecated, and in the future `None` will be used "
+                "as the name of the resulting DataFrame column.",
+                FutureWarning,
+            )
+            name = no_default
+
+        if name is not no_default:
             if len(name) != len(self.levels):
                 raise ValueError(
                     "'name' should have the same length as "
                     "number of levels on index."
                 )
-            df.columns = name
+            column_names = name
+        else:
+            column_names = self.names
+        all_none_names = None
+        if not (
+            all_none_names := all(x is None for x in column_names)
+        ) and len(column_names) != len(set(column_names)):
+            raise ValueError("Duplicate column names are not allowed")
+        df = cudf.DataFrame._from_data(
+            data=self._data,
+            columns=column_names
+            if name is not no_default and not all_none_names
+            else None,
+        )
+
+        if index:
+            df = df.set_index(self)
+
         return df
 
     @_cudf_nvtx_annotate
@@ -1504,7 +1581,9 @@ def droplevel(self, level=-1):
 
     @_cudf_nvtx_annotate
     def to_pandas(self, nullable=False, **kwargs):
-        result = self.to_frame(index=False).to_pandas(nullable=nullable)
+        result = self.to_frame(
+            index=False, name=list(range(self.nlevels))
+        ).to_pandas(nullable=nullable)
         return pd.MultiIndex.from_frame(result, names=self.names)
 
     @classmethod
@@ -1623,7 +1702,7 @@ def _clean_nulls_from_index(self):
         Convert all na values(if any) in MultiIndex object
         to `<NA>` as a preprocessing step to `__repr__` methods.
         """
-        index_df = self.to_frame(index=False)
+        index_df = self.to_frame(index=False, name=list(range(self.nlevels)))
         return MultiIndex.from_frame(
             index_df._clean_nulls_from_dataframe(index_df), names=self.names
         )
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 6fb615c22e0..b3791cddce3 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -11,6 +11,7 @@
 import pytest
 
 import cudf
+from cudf.api.extensions import no_default
 from cudf.api.types import is_bool_dtype
 from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200
 from cudf.core.index import (
@@ -2777,3 +2778,21 @@ def test_index_empty_from_pandas(request, dtype):
     gidx = cudf.from_pandas(pidx)
 
     assert_eq(pidx, gidx)
+
+
+@pytest.mark.parametrize(
+    "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)]
+)
+@pytest.mark.parametrize("data_name", [None, 1, "abc"])
+@pytest.mark.parametrize("index", [True, False])
+@pytest.mark.parametrize("name", [None, no_default, 1, "abc"])
+def test_index_to_frame(data, data_name, index, name):
+    pidx = pd.Index(data, name=data_name)
+    gidx = cudf.from_pandas(pidx)
+
+    with expect_warning_if(name is None):
+        expected = pidx.to_frame(index=index, name=name)
+    with expect_warning_if(name is None):
+        actual = gidx.to_frame(index=index, name=name)
+
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 3c843ace0a8..fb2b0c07efb 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -16,6 +16,7 @@
 import pytest
 
 import cudf
+from cudf.api.extensions import no_default
 from cudf.core._compat import PANDAS_GE_200
 from cudf.core.column import as_column
 from cudf.core.index import as_index
@@ -1926,3 +1927,85 @@ def test_multiindex_to_series_error():
     midx = cudf.MultiIndex.from_tuples([("a", "b")])
     with pytest.raises(NotImplementedError):
         midx.to_series()
+
+
+@pytest.mark.parametrize(
+    "pidx",
+    [
+        pd.MultiIndex.from_arrays(
+            [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]],
+            names=["a", "b", "c"],
+        ),
+        pd.MultiIndex.from_arrays(
+            [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]],
+            names=["a", "a", "a"],
+        ),
+        pd.MultiIndex.from_arrays(
+            [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]],
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    "name", [None, no_default, ["x", "y", "z"], ["rapids", "rapids", "rapids"]]
+)
+@pytest.mark.parametrize("allow_duplicates", [True, False])
+@pytest.mark.parametrize("index", [True, False])
+def test_multiindex_to_frame_allow_duplicates(
+    pidx, name, allow_duplicates, index
+):
+    gidx = cudf.from_pandas(pidx)
+
+    if (
+        (
+            len(pidx.names) != len(set(pidx.names))
+            and not all(x is None for x in pidx.names)
+        )
+        and not allow_duplicates
+        and (name is None or name is no_default)
+    ):
+        assert_exceptions_equal(
+            pidx.to_frame,
+            gidx.to_frame,
+            lfunc_args_and_kwargs=(
+                [],
+                {
+                    "index": index,
+                    "name": name,
+                    "allow_duplicates": allow_duplicates,
+                },
+            ),
+            rfunc_args_and_kwargs=(
+                [],
+                {
+                    "index": index,
+                    "name": name,
+                    "allow_duplicates": allow_duplicates,
+                },
+            ),
+        )
+    else:
+        if (
+            len(pidx.names) != len(set(pidx.names))
+            and not all(x is None for x in pidx.names)
+            and not isinstance(name, list)
+        ) or (isinstance(name, list) and len(name) != len(set(name))):
+            # cudf doesn't have the ability to construct dataframes
+            # with duplicate column names
+            with expect_warning_if(name is None):
+                with pytest.raises(ValueError):
+                    gidx.to_frame(
+                        index=index,
+                        name=name,
+                        allow_duplicates=allow_duplicates,
+                    )
+        else:
+            with expect_warning_if(name is None):
+                expected = pidx.to_frame(
+                    index=index, name=name, allow_duplicates=allow_duplicates
+                )
+            with expect_warning_if(name is None):
+                actual = gidx.to_frame(
+                    index=index, name=name, allow_duplicates=allow_duplicates
+                )
+
+            assert_eq(expected, actual)

From edfef800d98491ee61b390645548f9223bbfb049 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Wed, 13 Sep 2023 16:54:45 -0700
Subject: [PATCH 02/23] Refactor `hash_reduce_by_row` (#14095)

This PR extracts the `hash_reduce_by_row` function from the `distinct_reduce.*` files. Previously, that function was designed specifically to work with `distinct` in stream compaction and produced `size_type` output. It is now generic, supporting other reduction operations and various output types.

No new functionality was added.

The changes in this work pave the way for implementing histogram/merge histogram aggregations, which also rely on hash-based reduction.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14095
---
 cpp/CMakeLists.txt                            |   2 +-
 .../cudf/detail/hash_reduce_by_row.cuh        | 167 ++++++++++++++++++
 cpp/src/stream_compaction/distinct.cu         |  28 +--
 cpp/src/stream_compaction/distinct_count.cu   |   4 +-
 cpp/src/stream_compaction/distinct_helpers.cu | 109 ++++++++++++
 ...stinct_reduce.cuh => distinct_helpers.hpp} |  12 +-
 cpp/src/stream_compaction/distinct_reduce.cu  | 150 ----------------
 .../stream_compaction_common.cuh              |  22 ---
 .../stream_compaction_common.hpp              |   5 -
 9 files changed, 299 insertions(+), 200 deletions(-)
 create mode 100644 cpp/include/cudf/detail/hash_reduce_by_row.cuh
 create mode 100644 cpp/src/stream_compaction/distinct_helpers.cu
 rename cpp/src/stream_compaction/{distinct_reduce.cuh => distinct_helpers.hpp} (92%)
 delete mode 100644 cpp/src/stream_compaction/distinct_reduce.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c37d05a21c7..900e9eed98e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -530,7 +530,7 @@ add_library(
   src/stream_compaction/apply_boolean_mask.cu
   src/stream_compaction/distinct.cu
   src/stream_compaction/distinct_count.cu
-  src/stream_compaction/distinct_reduce.cu
+  src/stream_compaction/distinct_helpers.cu
   src/stream_compaction/drop_nans.cu
   src/stream_compaction/drop_nulls.cu
   src/stream_compaction/stable_distinct.cu
diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
new file mode 100644
index 00000000000..2d2b43f1d4a
--- /dev/null
+++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/table/experimental/row_operators.cuh>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/uninitialized_fill.h>
+
+#include <cuco/static_map.cuh>
+
+namespace cudf::detail {
+
+using hash_map_type =
+  cuco::static_map<size_type, size_type, cuda::thread_scope_device, hash_table_allocator_type>;
+
+/**
+ * @brief The base struct for customized reduction functors that perform reduce-by-key, where the
+ * keys are rows that compare equal.
+ *
+ * TODO: We need to switch to use `static_reduction_map` when it is ready
+ * (https://github.com/NVIDIA/cuCollections/pull/98).
+ */
+template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
+struct reduce_by_row_fn_base {
+ protected:
+  MapView const d_map;
+  KeyHasher const d_hasher;
+  KeyEqual const d_equal;
+  OutputType* const d_output;
+
+  reduce_by_row_fn_base(MapView const& d_map,
+                        KeyHasher const& d_hasher,
+                        KeyEqual const& d_equal,
+                        OutputType* const d_output)
+    : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output}
+  {
+  }
+
+  /**
+   * @brief Return a pointer to the output array at the given index.
+   *
+   * @param idx The access index
+   * @return A pointer to the given index in the output array
+   */
+  __device__ OutputType* get_output_ptr(size_type const idx) const
+  {
+    auto const iter = d_map.find(idx, d_hasher, d_equal);
+
+    if (iter != d_map.end()) {
+      // Only one (undetermined) index value of the duplicate rows could be inserted into the map.
+      // As such, looking up for all indices of duplicate rows always returns the same value.
+      auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed);
+
+      // All duplicate rows will have concurrent access to this same output slot.
+      return &d_output[inserted_idx];
+    } else {
+      // All input `idx` values have been inserted into the map before.
+      // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if
+      // `d_equal(idx, idx) == false`.
+      // Such situations are due to comparing nulls or NaNs which are considered as always unequal.
+      // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct
+      // output slot.
+      return &d_output[idx];
+    }
+  }
+};
+
+/**
+ * @brief Perform a reduction on groups of rows that are compared equal.
+ *
+ * This is essentially a reduce-by-key operation in which the keys are non-contiguous rows that
+ * compare equal. A hash table is used to find groups of equal rows.
+ *
+ * At the beginning of the operation, the entire output array is filled with a value given by
+ * the `init` parameter. Then, the reduction result for each row group is written into the output
+ * array at the index of an unspecified row in the group.
+ *
+ * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a
+ *         reduction functor derived from `reduce_by_row_fn_base`
+ * @tparam OutputType Type of the reduction results
+ * @param map The auxiliary map to perform reduction
+ * @param preprocessed_input The preprocessed input rows used for computing row hashes and row
+ *        comparisons
+ * @param num_rows The number of all input rows
+ * @param has_nulls Indicates whether the input rows have any nulls at any nested level
+ * @param has_nested_columns Indicates whether the input table has any nested columns
+ * @param nulls_equal Flag to specify whether null elements should be considered as equal
+ * @param nans_equal Flag to specify whether NaN values in floating point column should be
+ *        considered equal.
+ * @param init The initial value for reduction of each row group
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned vector
+ * @return A device_uvector containing the reduction results
+ */
+template <typename ReduceFuncBuilder, typename OutputType>
+rmm::device_uvector<OutputType> hash_reduce_by_row(
+  hash_map_type const& map,
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+  size_type num_rows,
+  cudf::nullate::DYNAMIC has_nulls,
+  bool has_nested_columns,
+  null_equality nulls_equal,
+  nan_equality nans_equal,
+  ReduceFuncBuilder func_builder,
+  OutputType init,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  auto const map_dview  = map.get_device_view();
+  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
+  auto const key_hasher = row_hasher.device_hasher(has_nulls);
+  auto const row_comp   = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+
+  auto reduction_results = rmm::device_uvector<OutputType>(num_rows, stream, mr);
+  thrust::uninitialized_fill(
+    rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init);
+
+  auto const reduce_by_row = [&](auto const value_comp) {
+    if (has_nested_columns) {
+      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
+      thrust::for_each(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_rows),
+        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
+    } else {
+      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
+      thrust::for_each(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_rows),
+        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
+    }
+  };
+
+  if (nans_equal == nan_equality::ALL_EQUAL) {
+    using nan_equal_comparator =
+      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
+    reduce_by_row(nan_equal_comparator{});
+  } else {
+    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
+    reduce_by_row(nan_unequal_comparator{});
+  }
+
+  return reduction_results;
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index cc60b2a12ea..cc1e3423d42 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "distinct_reduce.cuh"
+#include "distinct_helpers.hpp"
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/gather.hpp>
@@ -50,8 +50,8 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
   }
 
   auto map = hash_map_type{compute_hash_table_size(input.num_rows()),
-                           cuco::empty_key{COMPACTION_EMPTY_KEY_SENTINEL},
-                           cuco::empty_value{COMPACTION_EMPTY_VALUE_SENTINEL},
+                           cuco::empty_key{-1},
+                           cuco::empty_value{std::numeric_limits<size_type>::min()},
                            detail::hash_table_allocator_type{default_allocator<char>{}, stream},
                            stream.value()};
 
@@ -61,7 +61,7 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
   auto const has_nested_columns = cudf::detail::has_nested_columns(input);
 
   auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
+  auto const key_hasher = row_hasher.device_hasher(has_nulls);
 
   auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
 
@@ -96,16 +96,16 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
   }
 
   // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = hash_reduce_by_row(map,
-                                                    std::move(preprocessed_input),
-                                                    input.num_rows(),
-                                                    has_nulls,
-                                                    has_nested_columns,
-                                                    keep,
-                                                    nulls_equal,
-                                                    nans_equal,
-                                                    stream,
-                                                    rmm::mr::get_current_device_resource());
+  auto const reduction_results = reduce_by_row(map,
+                                               std::move(preprocessed_input),
+                                               input.num_rows(),
+                                               has_nulls,
+                                               has_nested_columns,
+                                               keep,
+                                               nulls_equal,
+                                               nans_equal,
+                                               stream,
+                                               rmm::mr::get_current_device_resource());
 
   // Extract the desired output indices from reduction results.
   auto const map_end = [&] {
diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index 4bca0827efe..ac4811ad279 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -136,14 +136,14 @@ cudf::size_type distinct_count(table_view const& keys,
   auto const preprocessed_input =
     cudf::experimental::row::hash::preprocessed_table::create(keys, stream);
   auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const hash_key   = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
+  auto const hash_key   = row_hasher.device_hasher(has_nulls);
   auto const row_comp   = cudf::experimental::row::equality::self_comparator(preprocessed_input);
 
   auto const comparator_helper = [&](auto const row_equal) {
     using hasher_type = decltype(hash_key);
     auto key_set      = cuco::experimental::static_set{
       cuco::experimental::extent{compute_hash_table_size(num_rows)},
-      cuco::empty_key<cudf::size_type>{COMPACTION_EMPTY_KEY_SENTINEL},
+      cuco::empty_key<cudf::size_type>{-1},
       row_equal,
       cuco::experimental::linear_probing<1, hasher_type>{hash_key},
       detail::hash_table_allocator_type{default_allocator<char>{}, stream},
diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu
new file mode 100644
index 00000000000..8f36ec98f4a
--- /dev/null
+++ b/cpp/src/stream_compaction/distinct_helpers.cu
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "distinct_helpers.hpp"
+
+#include <cudf/detail/hash_reduce_by_row.cuh>
+
+namespace cudf::detail {
+
+namespace {
+/**
+ * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
+ */
+template <typename MapView, typename KeyHasher, typename KeyEqual>
+struct reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
+  duplicate_keep_option const keep;
+
+  reduce_fn(MapView const& d_map,
+            KeyHasher const& d_hasher,
+            KeyEqual const& d_equal,
+            duplicate_keep_option const keep,
+            size_type* const d_output)
+    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
+                                                                     d_hasher,
+                                                                     d_equal,
+                                                                     d_output},
+      keep{keep}
+  {
+  }
+
+  __device__ void operator()(size_type const idx) const
+  {
+    auto const out_ptr = this->get_output_ptr(idx);
+
+    if (keep == duplicate_keep_option::KEEP_FIRST) {
+      // Store the smallest index of all rows that are equal.
+      atomicMin(out_ptr, idx);
+    } else if (keep == duplicate_keep_option::KEEP_LAST) {
+      // Store the greatest index of all rows that are equal.
+      atomicMax(out_ptr, idx);
+    } else {
+      // Count the number of rows in each group of rows that are compared equal.
+      atomicAdd(out_ptr, size_type{1});
+    }
+  }
+};
+
+/**
+ * @brief The builder to construct an instance of the `reduce_fn` functor based on the given
+ * value of the `duplicate_keep_option` member variable.
+ */
+struct reduce_func_builder {
+  duplicate_keep_option const keep;
+
+  template <typename MapView, typename KeyHasher, typename KeyEqual>
+  auto build(MapView const& d_map,
+             KeyHasher const& d_hasher,
+             KeyEqual const& d_equal,
+             size_type* const d_output)
+  {
+    return reduce_fn<MapView, KeyHasher, KeyEqual>{d_map, d_hasher, d_equal, keep, d_output};
+  }
+};
+
+}  // namespace
+
+// This function is split from `distinct.cu` to improve compile time.
+rmm::device_uvector<size_type> reduce_by_row(
+  hash_map_type const& map,
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+  size_type num_rows,
+  cudf::nullate::DYNAMIC has_nulls,
+  bool has_nested_columns,
+  duplicate_keep_option keep,
+  null_equality nulls_equal,
+  nan_equality nans_equal,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
+               "This function should not be called with KEEP_ANY");
+
+  return hash_reduce_by_row(map,
+                            preprocessed_input,
+                            num_rows,
+                            has_nulls,
+                            has_nested_columns,
+                            nulls_equal,
+                            nans_equal,
+                            reduce_func_builder{keep},
+                            reduction_init_value(keep),
+                            stream,
+                            mr);
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/distinct_reduce.cuh b/cpp/src/stream_compaction/distinct_helpers.hpp
similarity index 92%
rename from cpp/src/stream_compaction/distinct_reduce.cuh
rename to cpp/src/stream_compaction/distinct_helpers.hpp
index 8ec1fa18205..b667d0b04f0 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cuh
+++ b/cpp/src/stream_compaction/distinct_helpers.hpp
@@ -14,18 +14,14 @@
  * limitations under the License.
  */
 
-#include "stream_compaction_common.cuh"
+#include "stream_compaction_common.hpp"
 
-#include <cudf/column/column_device_view.cuh>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <memory>
 
 namespace cudf::detail {
 
@@ -56,6 +52,8 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
  * - If `keep == KEEP_LAST`: max of row indices in the group.
  * - If `keep == KEEP_NONE`: count of equivalent rows (group size).
  *
+ * Note that this function is not needed when `keep == KEEP_ANY`.
+ *
  * At the beginning of the operation, the entire output array is filled with a value given by
  * the `reduction_init_value()` function. Then, the reduction result for each row group is written
  * into the output array at the index of an unspecified row in the group.
@@ -68,11 +66,13 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
  * @param has_nested_columns Indicates whether the input table has any nested columns
  * @param keep The parameter to determine what type of reduction to perform
  * @param nulls_equal Flag to specify whether null elements should be considered as equal
+ * @param nans_equal Flag to specify whether NaN values in floating-point columns should be
+ *        considered equal
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned vector
  * @return A device_uvector containing the reduction results
  */
-rmm::device_uvector<size_type> hash_reduce_by_row(
+rmm::device_uvector<size_type> reduce_by_row(
   hash_map_type const& map,
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
   size_type num_rows,
diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
deleted file mode 100644
index 020e6a495bc..00000000000
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distinct_reduce.cuh"
-
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/uninitialized_fill.h>
-
-namespace cudf::detail {
-
-namespace {
-/**
- * @brief A functor to perform reduce-by-key with keys are rows that compared equal.
- *
- * TODO: We need to switch to use `static_reduction_map` when it is ready
- * (https://github.com/NVIDIA/cuCollections/pull/98).
- */
-template <typename MapView, typename KeyHasher, typename KeyEqual>
-struct reduce_by_row_fn {
-  MapView const d_map;
-  KeyHasher const d_hasher;
-  KeyEqual const d_equal;
-  duplicate_keep_option const keep;
-  size_type* const d_output;
-
-  reduce_by_row_fn(MapView const& d_map,
-                   KeyHasher const& d_hasher,
-                   KeyEqual const& d_equal,
-                   duplicate_keep_option const keep,
-                   size_type* const d_output)
-    : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, keep{keep}, d_output{d_output}
-  {
-  }
-
-  __device__ void operator()(size_type const idx) const
-  {
-    auto const out_ptr = get_output_ptr(idx);
-
-    if (keep == duplicate_keep_option::KEEP_FIRST) {
-      // Store the smallest index of all rows that are equal.
-      atomicMin(out_ptr, idx);
-    } else if (keep == duplicate_keep_option::KEEP_LAST) {
-      // Store the greatest index of all rows that are equal.
-      atomicMax(out_ptr, idx);
-    } else {
-      // Count the number of rows in each group of rows that are compared equal.
-      atomicAdd(out_ptr, size_type{1});
-    }
-  }
-
- private:
-  __device__ size_type* get_output_ptr(size_type const idx) const
-  {
-    auto const iter = d_map.find(idx, d_hasher, d_equal);
-
-    if (iter != d_map.end()) {
-      // Only one index value of the duplicate rows could be inserted into the map.
-      // As such, looking up for all indices of duplicate rows always returns the same value.
-      auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed);
-
-      // All duplicate rows will have concurrent access to this same output slot.
-      return &d_output[inserted_idx];
-    } else {
-      // All input `idx` values have been inserted into the map before.
-      // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if
-      // `d_equal(idx, idx) == false`.
-      // Such situations are due to comparing nulls or NaNs which are considered as always unequal.
-      // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct
-      // output slot.
-      return &d_output[idx];
-    }
-  }
-};
-
-}  // namespace
-
-rmm::device_uvector<size_type> hash_reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
-               "This function should not be called with KEEP_ANY");
-
-  auto reduction_results = rmm::device_uvector<size_type>(num_rows, stream, mr);
-
-  thrust::uninitialized_fill(rmm::exec_policy(stream),
-                             reduction_results.begin(),
-                             reduction_results.end(),
-                             reduction_init_value(keep));
-
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
-
-  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto const reduce_by_row = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        reduce_by_row_fn{
-          map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
-    } else {
-      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        reduce_by_row_fn{
-          map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
-    }
-  };
-
-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    reduce_by_row(nan_equal_comparator{});
-  } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    reduce_by_row(nan_unequal_comparator{});
-  }
-
-  return reduction_results;
-}
-
-}  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh
index 4779cd990fd..839672d6a56 100644
--- a/cpp/src/stream_compaction/stream_compaction_common.cuh
+++ b/cpp/src/stream_compaction/stream_compaction_common.cuh
@@ -29,28 +29,6 @@
 namespace cudf {
 namespace detail {
 
-namespace experimental {
-
-/**
- * @brief Device callable to hash a given row.
- */
-template <typename RowHash>
-class compaction_hash {
- public:
-  compaction_hash(RowHash row_hasher) : _hash{row_hasher} {}
-
-  __device__ inline auto operator()(size_type i) const noexcept
-  {
-    auto hash = _hash(i);
-    return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash;
-  }
-
- private:
-  RowHash _hash;
-};
-
-}  // namespace experimental
-
 /**
  * @brief Device functor to determine if a row is valid.
  */
diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp
index 0cd2d8f4b14..58d958d2ff4 100644
--- a/cpp/src/stream_compaction/stream_compaction_common.hpp
+++ b/cpp/src/stream_compaction/stream_compaction_common.hpp
@@ -30,11 +30,6 @@
 namespace cudf {
 namespace detail {
 
-constexpr auto COMPACTION_EMPTY_KEY_SENTINEL   = std::numeric_limits<size_type>::max();
-constexpr auto COMPACTION_EMPTY_VALUE_SENTINEL = std::numeric_limits<size_type>::min();
-
-using hash_type = cuco::murmurhash3_32<size_type>;
-
 using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
 
 using hash_map_type =

From 664dfc33a29ddb86e671c19f12e2b56e32d46a8b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 13 Sep 2023 14:21:57 -1000
Subject: [PATCH 03/23] Raise NotImplementedError in to_datetime if Z (or tz
 component) in string (#14074)

closes #14039
Avoids this discrepancy when a date string has a tz component

```python
In [1]: import pandas

In [2]: import cudf

In [3]: data = ["2019-01-01T00:00:00.000Z"]

In [4]: cudf.to_datetime(data)
Out[4]: DatetimeIndex(['2019-01-01'], dtype='datetime64[ns]')

In [5]: pandas.to_datetime(data)
Out[5]: DatetimeIndex(['2019-01-01 00:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None)
```

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14074
---
 python/cudf/cudf/core/column/datetime.py | 15 +++++---
 python/cudf/cudf/tests/test_datetime.py  | 49 +++++++++++-------------
 python/cudf/cudf/tests/test_string.py    | 12 +++---
 3 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index da6c4fb858c..7775723e267 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -631,6 +631,10 @@ def infer_format(element: str, **kwargs) -> str:
     fmt = _guess_datetime_format(element, **kwargs)
 
     if fmt is not None:
+        if "%z" in fmt or "%Z" in fmt:
+            raise NotImplementedError(
+                "cuDF does not yet support timezone-aware datetimes"
+            )
         return fmt
 
     element_parts = element.split(".")
@@ -651,11 +655,12 @@ def infer_format(element: str, **kwargs) -> str:
         raise ValueError("Unable to infer the timestamp format from the data")
 
     if len(second_parts) > 1:
-        # "Z" indicates Zulu time(widely used in aviation) - Which is
-        # UTC timezone that currently cudf only supports. Having any other
-        # unsupported timezone will let the code fail below
-        # with a ValueError.
-        second_parts.remove("Z")
+        # We may have a non-digit, timezone-like component
+        # like Z, UTC-3, +01:00
+        if any(re.search(r"\D", part) for part in second_parts):
+            raise NotImplementedError(
+                "cuDF does not yet support timezone-aware datetimes"
+            )
         second_part = "".join(second_parts[1:])
 
         if len(second_part) > 1:
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 4c20258ae67..5cab19eedc6 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1250,40 +1250,31 @@ def test_datetime_reductions(data, op, dtype):
         assert_eq(expected, actual)
 
 
+@pytest.mark.parametrize("timezone", ["naive", "UTC"])
 @pytest.mark.parametrize(
     "data",
     [
-        np.datetime_as_string(
-            np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"),
-            timezone="UTC",
-        ),
-        np.datetime_as_string(
-            np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"),
-            timezone="UTC",
-        ),
-        np.datetime_as_string(
-            np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"),
-            timezone="UTC",
-        ),
-        np.datetime_as_string(
-            np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"),
-            timezone="UTC",
-        ),
-        np.datetime_as_string(
-            np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"),
-            timezone="UTC",
-        ),
+        np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"),
+        np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"),
+        np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"),
+        np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"),
+        np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"),
     ],
 )
 @pytest.mark.parametrize("dtype", DATETIME_TYPES)
-def test_datetime_infer_format(data, dtype):
-    sr = cudf.Series(data)
-    psr = pd.Series(data)
+def test_datetime_infer_format(data, timezone, dtype):
+    ts_data = np.datetime_as_string(data, timezone=timezone)
+    sr = cudf.Series(ts_data)
+    if timezone == "naive":
+        psr = pd.Series(ts_data)
 
-    expected = psr.astype(dtype)
-    actual = sr.astype(dtype)
+        expected = psr.astype(dtype)
+        actual = sr.astype(dtype)
 
-    assert_eq(expected, actual)
+        assert_eq(expected, actual)
+    else:
+        with pytest.raises(NotImplementedError):
+            sr.astype(dtype)
 
 
 def test_dateoffset_instance_subclass_check():
@@ -2158,6 +2149,12 @@ def test_format_timezone_not_implemented(code):
         )
 
 
+@pytest.mark.parametrize("tz", ["Z", "UTC-3", "+01:00"])
+def test_no_format_timezone_not_implemented(tz):
+    with pytest.raises(NotImplementedError):
+        cudf.to_datetime([f"2020-01-01 00:00:00{tz}"])
+
+
 @pytest.mark.parametrize("arg", [True, False])
 def test_args_not_datetime_typerror(arg):
     with pytest.raises(TypeError):
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 2bddd93ccb8..d54027eb707 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -200,12 +200,12 @@ def test_string_astype(dtype):
         data = ["True", "False", "True", "False", "False"]
     elif dtype.startswith("datetime64"):
         data = [
-            "2019-06-04T00:00:00Z",
-            "2019-06-04T12:12:12Z",
-            "2019-06-03T00:00:00Z",
-            "2019-05-04T00:00:00Z",
-            "2018-06-04T00:00:00Z",
-            "1922-07-21T01:02:03Z",
+            "2019-06-04T00:00:00",
+            "2019-06-04T12:12:12",
+            "2019-06-03T00:00:00",
+            "2019-05-04T00:00:00",
+            "2018-06-04T00:00:00",
+            "1922-07-21T01:02:03",
         ]
     elif dtype == "str" or dtype == "object":
         data = ["ab", "cd", "ef", "gh", "ij"]

From 89557bb0efad2d32098ba86b78e4f4706e7fe88f Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 13 Sep 2023 19:22:46 -0500
Subject: [PATCH 04/23] Allow `numeric_only=True` for reduction operations on
 numeric types (#14111)

Fixes: #14090
This PR allows passing `numeric_only=True` for reduction operations on numeric columns.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/14111
---
 python/cudf/cudf/core/single_column_frame.py |  6 ++-
 python/cudf/cudf/tests/test_stats.py         | 44 ++++++++++----------
 2 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 7c019f0722c..6a56ab8f3a5 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -49,9 +49,11 @@ def _reduce(
         if level is not None:
             raise NotImplementedError("level parameter is not implemented yet")
 
-        if numeric_only:
+        if numeric_only and not isinstance(
+            self._column, cudf.core.column.numerical_base.NumericalBaseColumn
+        ):
             raise NotImplementedError(
-                f"Series.{op} does not implement numeric_only"
+                f"Series.{op} does not implement numeric_only."
             )
         try:
             return getattr(self._column, op)(**kwargs)
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index 6478fbaad95..463cdb8a7f4 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -247,30 +247,37 @@ def test_misc_quantiles(data, q):
     ],
 )
 @pytest.mark.parametrize("null_flag", [False, True])
-def test_kurtosis_series(data, null_flag):
+@pytest.mark.parametrize("numeric_only", [False, True])
+def test_kurtosis_series(data, null_flag, numeric_only):
     pdata = data.to_pandas()
 
     if null_flag and len(data) > 2:
         data.iloc[[0, 2]] = None
         pdata.iloc[[0, 2]] = None
 
-    got = data.kurtosis()
+    got = data.kurtosis(numeric_only=numeric_only)
     got = got if np.isscalar(got) else got.to_numpy()
-    expected = pdata.kurtosis()
+    expected = pdata.kurtosis(numeric_only=numeric_only)
     np.testing.assert_array_almost_equal(got, expected)
 
-    got = data.kurt()
+    got = data.kurt(numeric_only=numeric_only)
     got = got if np.isscalar(got) else got.to_numpy()
-    expected = pdata.kurt()
+    expected = pdata.kurt(numeric_only=numeric_only)
     np.testing.assert_array_almost_equal(got, expected)
 
-    got = data.kurt(numeric_only=False)
-    got = got if np.isscalar(got) else got.to_numpy()
-    expected = pdata.kurt(numeric_only=False)
-    np.testing.assert_array_almost_equal(got, expected)
 
-    with pytest.raises(NotImplementedError):
-        data.kurt(numeric_only=True)
+@pytest.mark.parametrize("op", ["skew", "kurt"])
+def test_kurt_skew_error(op):
+    gs = cudf.Series(["ab", "cd"])
+    ps = gs.to_pandas()
+
+    with pytest.raises(FutureWarning):
+        assert_exceptions_equal(
+            getattr(gs, op),
+            getattr(ps, op),
+            lfunc_args_and_kwargs=([], {"numeric_only": True}),
+            rfunc_args_and_kwargs=([], {"numeric_only": True}),
+        )
 
 
 @pytest.mark.parametrize(
@@ -290,26 +297,19 @@ def test_kurtosis_series(data, null_flag):
     ],
 )
 @pytest.mark.parametrize("null_flag", [False, True])
-def test_skew_series(data, null_flag):
+@pytest.mark.parametrize("numeric_only", [False, True])
+def test_skew_series(data, null_flag, numeric_only):
     pdata = data.to_pandas()
 
     if null_flag and len(data) > 2:
         data.iloc[[0, 2]] = None
         pdata.iloc[[0, 2]] = None
 
-    got = data.skew()
-    expected = pdata.skew()
+    got = data.skew(numeric_only=numeric_only)
+    expected = pdata.skew(numeric_only=numeric_only)
     got = got if np.isscalar(got) else got.to_numpy()
     np.testing.assert_array_almost_equal(got, expected)
 
-    got = data.skew(numeric_only=False)
-    expected = pdata.skew(numeric_only=False)
-    got = got if np.isscalar(got) else got.to_numpy()
-    np.testing.assert_array_almost_equal(got, expected)
-
-    with pytest.raises(NotImplementedError):
-        data.skew(numeric_only=True)
-
 
 @pytest.mark.parametrize("dtype", params_dtypes)
 @pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100])

From 1bfeee7575e137bc75741cb2caf015e55ecab2cd Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 13 Sep 2023 14:23:14 -1000
Subject: [PATCH 05/23] Raise NotImplementedError for datetime strings with UTC
 offset (#14070)

Prevents e.g. `DatetimeIndex(["2022-07-22 00:00:00+02:00"])` from dropping the `+02:00` offset, since timezone-aware datetimes are not supported.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14070
---
 python/cudf/cudf/core/column/column.py  | 18 ++++++++++++++++--
 python/cudf/cudf/tests/test_datetime.py |  6 ++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 59ab3569814..d2e2f11a12e 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2519,11 +2519,11 @@ def _construct_array(
         arbitrary = cupy.asarray(arbitrary, dtype=dtype)
     except (TypeError, ValueError):
         native_dtype = dtype
-        inferred_dtype = None
+        inferred_dtype = infer_dtype(arbitrary, skipna=False)
         if (
             dtype is None
             and not cudf._lib.scalar._is_null_host_scalar(arbitrary)
-            and (inferred_dtype := infer_dtype(arbitrary, skipna=False))
+            and inferred_dtype
             in (
                 "mixed",
                 "mixed-integer",
@@ -2533,6 +2533,20 @@ def _construct_array(
         if inferred_dtype == "interval":
             # Only way to construct an Interval column.
             return pd.array(arbitrary)
+        elif (
+            inferred_dtype == "string" and getattr(dtype, "kind", None) == "M"
+        ):
+            # We may have date-like strings with timezones
+            try:
+                pd_arbitrary = pd.to_datetime(arbitrary)
+                if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype):
+                    raise NotImplementedError(
+                        "cuDF does not yet support timezone-aware datetimes"
+                    )
+            except pd.errors.OutOfBoundsDatetime:
+                # https://github.com/pandas-dev/pandas/issues/55096
+                pass
+
         arbitrary = np.asarray(
             arbitrary,
             dtype=native_dtype
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 5cab19eedc6..0cc7112454c 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -2141,6 +2141,12 @@ def test_daterange_pandas_compatibility():
     assert_eq(expected, actual)
 
 
+def test_strings_with_utc_offset_not_implemented():
+    with pytest.warns(DeprecationWarning, match="parsing timezone"):  # cupy
+        with pytest.raises(NotImplementedError):
+            DatetimeIndex(["2022-07-22 00:00:00+02:00"])
+
+
 @pytest.mark.parametrize("code", ["z", "Z"])
 def test_format_timezone_not_implemented(code):
     with pytest.raises(NotImplementedError):

From 3b691f4be744ff1155df3634cd334211e738e37d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 15 Sep 2023 10:03:52 -1000
Subject: [PATCH 06/23] Raise NotImplementedError in to_datetime with dayfirst
 without infer_format (#14058)

Raises a `NotImplementedError` to avoid this incorrect behavior (which seems to actually not be implemented)

```python
In [6]: cudf.to_datetime(["10-02-2014"], dayfirst=True)
Out[6]: DatetimeIndex(['2014-10-02'], dtype='datetime64[ns]')
```

closes https://github.com/rapidsai/cudf/issues/14042

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14058
---
 python/cudf/cudf/core/tools/datetimes.py | 11 +++----
 python/cudf/cudf/tests/test_datetime.py  | 38 +++++++++++++++++++-----
 2 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index f736e055163..a3f4bacf206 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -353,15 +353,16 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
                 format=format,
             )
         else:
-            if infer_datetime_format and format is None:
+            if format is None:
+                if not infer_datetime_format and dayfirst:
+                    raise NotImplementedError(
+                        f"{dayfirst=} not implemented "
+                        f"when {format=} and {infer_datetime_format=}."
+                    )
                 format = column.datetime.infer_format(
                     element=col.element_indexing(0),
                     dayfirst=dayfirst,
                 )
-            elif format is None:
-                format = column.datetime.infer_format(
-                    element=col.element_indexing(0)
-                )
             return col.as_datetime_column(
                 dtype=_unit_dtype_map[unit],
                 format=format,
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 0cc7112454c..164856ed6f5 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -617,22 +617,44 @@ def test_datetime_dataframe():
 @pytest.mark.parametrize("infer_datetime_format", [True, False])
 def test_cudf_to_datetime(data, dayfirst, infer_datetime_format):
     pd_data = data
+    is_string_data = False
     if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)):
         gd_data = cudf.from_pandas(pd_data)
+        is_string_data = (
+            gd_data.ndim == 1
+            and not gd_data.empty
+            and gd_data.dtype.kind == "O"
+        )
     else:
         if type(pd_data).__module__ == np.__name__:
             gd_data = cp.array(pd_data)
         else:
             gd_data = pd_data
+            is_string_data = isinstance(gd_data, list) and isinstance(
+                next(iter(gd_data), None), str
+            )
 
-    expected = pd.to_datetime(
-        pd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format
-    )
-    actual = cudf.to_datetime(
-        gd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format
-    )
-
-    assert_eq(actual, expected)
+    if dayfirst and not infer_datetime_format and is_string_data:
+        # Note: pandas<2.0 also does not respect dayfirst=True correctly
+        # for object data
+        with pytest.raises(NotImplementedError):
+            cudf.to_datetime(
+                gd_data,
+                dayfirst=dayfirst,
+                infer_datetime_format=infer_datetime_format,
+            )
+    else:
+        expected = pd.to_datetime(
+            pd_data,
+            dayfirst=dayfirst,
+            infer_datetime_format=infer_datetime_format,
+        )
+        actual = cudf.to_datetime(
+            gd_data,
+            dayfirst=dayfirst,
+            infer_datetime_format=infer_datetime_format,
+        )
+        assert_eq(actual, expected)
 
 
 @pytest.mark.parametrize(

From 4ca568e764a3898bf619a221cdb91a9261df22bf Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 18 Sep 2023 09:00:39 -0500
Subject: [PATCH 07/23] Update pyarrow-related dispatch logic in dask_cudf
 (#14069)

Updates `dask_cudf` dispatch logic to avoid breakage from https://github.com/dask/dask/pull/10500.
Also removes stale `try`/`except` logic.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Ray Douglass (https://github.com/raydouglass)
  - gpuCI (https://github.com/GPUtester)
  - Mike Wendt (https://github.com/mike-wendt)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14069
---
 python/dask_cudf/dask_cudf/backends.py        | 69 +++++++++----------
 .../dask_cudf/tests/test_dispatch.py          | 21 ++++--
 2 files changed, 47 insertions(+), 43 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 2470b4d50f1..e3f4f04eb85 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -20,11 +20,14 @@
 from dask.dataframe.dispatch import (
     categorical_dtype_dispatch,
     concat_dispatch,
+    from_pyarrow_table_dispatch,
     group_split_dispatch,
     grouper_dispatch,
     hash_object_dispatch,
     is_categorical_dtype_dispatch,
     make_meta_dispatch,
+    pyarrow_schema_dispatch,
+    to_pyarrow_table_dispatch,
     tolist_dispatch,
     union_categoricals_dispatch,
 )
@@ -317,16 +320,6 @@ def get_grouper_cudf(obj):
     return cudf.core.groupby.Grouper
 
 
-try:
-    from dask.dataframe.dispatch import pyarrow_schema_dispatch
-
-    @pyarrow_schema_dispatch.register((cudf.DataFrame,))
-    def get_pyarrow_schema_cudf(obj):
-        return obj.to_arrow().schema
-
-except ImportError:
-    pass
-
 try:
     try:
         from dask.array.dispatch import percentile_lookup
@@ -378,35 +371,37 @@ def percentile_cudf(a, q, interpolation="linear"):
 except ImportError:
     pass
 
-try:
-    # Requires dask>2023.6.0
-    from dask.dataframe.dispatch import (
-        from_pyarrow_table_dispatch,
-        to_pyarrow_table_dispatch,
-    )
 
-    @to_pyarrow_table_dispatch.register(cudf.DataFrame)
-    def _cudf_to_table(obj, preserve_index=True, **kwargs):
-        if kwargs:
-            warnings.warn(
-                "Ignoring the following arguments to "
-                f"`to_pyarrow_table_dispatch`: {list(kwargs)}"
-            )
-        return obj.to_arrow(preserve_index=preserve_index)
-
-    @from_pyarrow_table_dispatch.register(cudf.DataFrame)
-    def _table_to_cudf(obj, table, self_destruct=None, **kwargs):
-        # cudf ignores self_destruct.
-        kwargs.pop("self_destruct", None)
-        if kwargs:
-            warnings.warn(
-                f"Ignoring the following arguments to "
-                f"`from_pyarrow_table_dispatch`: {list(kwargs)}"
-            )
-        return obj.from_arrow(table)
+@pyarrow_schema_dispatch.register((cudf.DataFrame,))
+def _get_pyarrow_schema_cudf(obj, preserve_index=True, **kwargs):
+    if kwargs:
+        warnings.warn(
+            "Ignoring the following arguments to "
+            f"`pyarrow_schema_dispatch`: {list(kwargs)}"
+        )
+    return meta_nonempty(obj).to_arrow(preserve_index=preserve_index).schema
 
-except ImportError:
-    pass
+
+@to_pyarrow_table_dispatch.register(cudf.DataFrame)
+def _cudf_to_table(obj, preserve_index=True, **kwargs):
+    if kwargs:
+        warnings.warn(
+            "Ignoring the following arguments to "
+            f"`to_pyarrow_table_dispatch`: {list(kwargs)}"
+        )
+    return obj.to_arrow(preserve_index=preserve_index)
+
+
+@from_pyarrow_table_dispatch.register(cudf.DataFrame)
+def _table_to_cudf(obj, table, self_destruct=None, **kwargs):
+    # cudf ignores self_destruct.
+    kwargs.pop("self_destruct", None)
+    if kwargs:
+        warnings.warn(
+            f"Ignoring the following arguments to "
+            f"`from_pyarrow_table_dispatch`: {list(kwargs)}"
+        )
+    return obj.from_arrow(table)
 
 
 @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex))
diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py
index 22cc0f161e2..cf49b1df4f4 100644
--- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py
+++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py
@@ -3,9 +3,7 @@
 import numpy as np
 import pandas as pd
 import pytest
-from packaging import version
 
-import dask
 from dask.base import tokenize
 from dask.dataframe import assert_eq
 from dask.dataframe.methods import is_categorical_dtype
@@ -24,10 +22,6 @@ def test_is_categorical_dispatch():
     assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category"))
 
 
-@pytest.mark.skipif(
-    version.parse(dask.__version__) <= version.parse("2023.6.0"),
-    reason="Pyarrow-conversion dispatch requires dask>2023.6.0",
-)
 def test_pyarrow_conversion_dispatch():
     from dask.dataframe.dispatch import (
         from_pyarrow_table_dispatch,
@@ -79,3 +73,18 @@ def test_deterministic_tokenize(index):
     df2 = df.set_index(["B", "C"], drop=False)
     assert tokenize(df) != tokenize(df2)
     assert tokenize(df2) == tokenize(df2)
+
+
+@pytest.mark.parametrize("preserve_index", [True, False])
+def test_pyarrow_schema_dispatch(preserve_index):
+    from dask.dataframe.dispatch import (
+        pyarrow_schema_dispatch,
+        to_pyarrow_table_dispatch,
+    )
+
+    df = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc"))
+    df["d"] = cudf.Series(["cat", "dog"] * 5)
+    table = to_pyarrow_table_dispatch(df, preserve_index=preserve_index)
+    schema = pyarrow_schema_dispatch(df, preserve_index=preserve_index)
+
+    assert schema.equals(table.schema)

From 5935ef3ce26b1eb7136dcaa989a36b15071a9d0d Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 18 Sep 2023 09:53:18 -0500
Subject: [PATCH 08/23] Drop `kwargs` from `Series.count` (#14106)

Fixes: #14089
This PR drops `kwargs` from `Series.count` method signature.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Benjamin Zaitlen (https://github.com/quasiben)

URL: https://github.com/rapidsai/cudf/pull/14106
---
 python/cudf/cudf/core/series.py       | 2 +-
 python/cudf/cudf/tests/test_series.py | 6 ++++++
 python/dask_cudf/dask_cudf/core.py    | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index f44a3123dd3..7692d3015f8 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2549,7 +2549,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs):
     # Stats
     #
     @_cudf_nvtx_annotate
-    def count(self, level=None, **kwargs):
+    def count(self, level=None):
         """
         Return number of non-NA/null observations in the Series
 
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 798809b0ada..b1e991106ee 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2311,3 +2311,9 @@ def test_series_round_builtin(data, digits):
     actual = round(gs, digits)
 
     assert_eq(expected, actual)
+
+
+def test_series_count_invalid_param():
+    s = cudf.Series([])
+    with pytest.raises(TypeError):
+        s.count(skipna=True)
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index d2858876fcd..5b37e6e825c 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -421,7 +421,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out):
 def _parallel_var(ddf, meta, skipna, split_every, out):
     def _local_var(x, skipna):
         if skipna:
-            n = x.count(skipna=skipna)
+            n = x.count()
             avg = x.mean(skipna=skipna)
         else:
             # Not skipping nulls, so might as well

From 8e081c015417c5a8d2a99f9db6bbc9a2c438e477 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 18 Sep 2023 12:51:08 -0500
Subject: [PATCH 09/23] Add support for nested dict in `DataFrame` constructor
 (#14119)

Fixes: #14096

This PR enables nested dict initialization support in `DataFrame` constructor.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/14119
---
 python/cudf/cudf/core/dataframe.py       |  4 ++--
 python/cudf/cudf/tests/test_dataframe.py | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 5a3d25a08a7..4fc175512a0 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -977,7 +977,7 @@ def _align_input_series_indices(data, index):
         input_series = [
             Series(val)
             for val in data.values()
-            if isinstance(val, (pd.Series, Series))
+            if isinstance(val, (pd.Series, Series, dict))
         ]
 
         if input_series:
@@ -994,7 +994,7 @@ def _align_input_series_indices(data, index):
                 index = aligned_input_series[0].index
 
             for name, val in data.items():
-                if isinstance(val, (pd.Series, Series)):
+                if isinstance(val, (pd.Series, Series, dict)):
                     data[name] = aligned_input_series.pop(0)
 
         return data, index
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 61372bab3ad..652bdbbee45 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10349,3 +10349,22 @@ def test_dataframe_round_builtin(digits):
     actual = round(gdf, digits)
 
     assert_eq(expected, actual)
+
+
+def test_dataframe_init_from_nested_dict():
+    ordered_dict = OrderedDict(
+        [
+            ("one", OrderedDict([("col_a", "foo1"), ("col_b", "bar1")])),
+            ("two", OrderedDict([("col_a", "foo2"), ("col_b", "bar2")])),
+            ("three", OrderedDict([("col_a", "foo3"), ("col_b", "bar3")])),
+        ]
+    )
+    pdf = pd.DataFrame(ordered_dict)
+    gdf = cudf.DataFrame(ordered_dict)
+
+    assert_eq(pdf, gdf)
+    regular_dict = {key: dict(value) for key, value in ordered_dict.items()}
+
+    pdf = pd.DataFrame(regular_dict)
+    gdf = cudf.DataFrame(regular_dict)
+    assert_eq(pdf, gdf)

From 4467066c952111c0131383784d3eb6bf3248f0ac Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 18 Sep 2023 12:51:53 -0500
Subject: [PATCH 10/23] Restrict iterables of `DataFrame`'s as input to
 `DataFrame` constructor (#14118)

Fixes: #14094
This PR raises an error when an iterable of `DataFrame`s is detected in the `DataFrame` constructor.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/14118
---
 python/cudf/cudf/core/dataframe.py       | 11 ++++++-----
 python/cudf/cudf/tests/test_dataframe.py |  6 ++++++
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 4fc175512a0..84c16b71997 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -852,12 +852,13 @@ def _init_from_list_like(self, data, index=None, columns=None):
         elif len(data) > 0 and isinstance(data[0], pd._libs.interval.Interval):
             data = DataFrame.from_pandas(pd.DataFrame(data))
             self._data = data._data
+        elif any(
+            not isinstance(col, (abc.Iterable, abc.Sequence)) for col in data
+        ):
+            raise TypeError("Inputs should be an iterable or sequence.")
+        elif len(data) > 0 and not can_convert_to_column(data[0]):
+            raise ValueError("Must pass 2-d input.")
         else:
-            if any(
-                not isinstance(col, (abc.Iterable, abc.Sequence))
-                for col in data
-            ):
-                raise TypeError("Inputs should be an iterable or sequence.")
             if (
                 len(data) > 0
                 and columns is None
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 652bdbbee45..cbef9bfa2d8 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10260,6 +10260,12 @@ def __getitem__(self, key):
         cudf.DataFrame({"a": A()})
 
 
+def test_dataframe_constructor_dataframe_list():
+    df = cudf.DataFrame(range(2))
+    with pytest.raises(ValueError):
+        cudf.DataFrame([df])
+
+
 def test_dataframe_constructor_from_namedtuple():
     Point1 = namedtuple("Point1", ["a", "b", "c"])
     Point2 = namedtuple("Point1", ["x", "y"])

From 2acd3dfa9e859feb4d803d9446c89b80f10bd54a Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 18 Sep 2023 14:10:14 -0700
Subject: [PATCH 11/23] Expand statistics support in ORC writer (#13848)

Closes #7087, closes #13793, closes #13899

This PR adds support for several cases and statistics types:
- sum statistics are included even when all elements are null (no minmax);
- sum statistics are included in double stats;
- minimum/maximum and minimumNanos/maximumNanos are included in timestamp stats;
- hasNull field is written for all columns.
- decimal statistics

Added tests for all supported stats.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/13848
---
 cpp/include/cudf/io/orc_metadata.hpp          |  10 +-
 .../detail/convert/fixed_point_to_string.cuh  |  80 +++++++++
 cpp/src/io/orc/orc.cpp                        |   4 +-
 cpp/src/io/orc/stats_enc.cu                   | 169 +++++++++++++-----
 cpp/src/io/parquet/page_enc.cu                |   4 +-
 .../statistics_type_identification.cuh        |  19 +-
 .../io/statistics/typed_statistics_chunk.cuh  |   2 +-
 .../strings/convert/convert_fixed_point.cu    |  54 +-----
 cpp/tests/io/orc_test.cpp                     | 109 +++++++++--
 python/cudf/cudf/tests/test_orc.py            |  60 ++++---
 10 files changed, 356 insertions(+), 155 deletions(-)
 create mode 100644 cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh

diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp
index 623ee2e49fc..82d59803c25 100644
--- a/cpp/include/cudf/io/orc_metadata.hpp
+++ b/cpp/include/cudf/io/orc_metadata.hpp
@@ -111,10 +111,10 @@ struct string_statistics : minmax_statistics<std::string>, sum_statistics<int64_
 /**
  * @brief Statistics for boolean columns.
  *
- * The `count` array includes the count of `false` and `true` values.
+ * The `count` array contains the count of `true` values.
  */
 struct bucket_statistics {
-  std::vector<uint64_t> count;  ///< Count of `false` and `true` values
+  std::vector<uint64_t> count;  ///< count of `true` values
 };
 
 /**
@@ -141,8 +141,10 @@ using binary_statistics = sum_statistics<int64_t>;
  * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC.
  */
 struct timestamp_statistics : minmax_statistics<int64_t> {
-  std::optional<int64_t> minimum_utc;  ///< minimum in milliseconds
-  std::optional<int64_t> maximum_utc;  ///< maximum in milliseconds
+  std::optional<int64_t> minimum_utc;    ///< minimum in milliseconds
+  std::optional<int64_t> maximum_utc;    ///< maximum in milliseconds
+  std::optional<int32_t> minimum_nanos;  ///< nanoseconds part of the minimum
+  std::optional<int32_t> maximum_nanos;  ///< nanoseconds part of the maximum
 };
 
 namespace orc {
diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh
new file mode 100644
index 00000000000..0ee26ec9ee2
--- /dev/null
+++ b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/strings/detail/convert/int_to_string.cuh>
+
+namespace cudf::strings::detail {
+
+/**
+ * @brief Returns the size of the string representation of the given fixed point number.
+ *
+ * @param value The value of the fixed point number
+ * @param scale The scale of the fixed point number
+ * @return int32_t The number of characters required to represent the fixed point number
+ */
+__device__ inline int32_t fixed_point_string_size(__int128_t const& value, int32_t scale)
+{
+  if (scale >= 0) return count_digits(value) + scale;
+
+  auto const abs_value = numeric::detail::abs(value);
+  auto const exp_ten   = numeric::detail::exp10<__int128_t>(-scale);
+  auto const fraction  = count_digits(abs_value % exp_ten);
+  auto const num_zeros = std::max(0, (-scale - fraction));
+  return static_cast<int32_t>(value < 0) +    // sign if negative
+         count_digits(abs_value / exp_ten) +  // integer
+         1 +                                  // decimal point
+         num_zeros +                          // zeros padding
+         fraction;                            // size of fraction
+}
+
+/**
+ * @brief Converts the given fixed point number to a string.
+ *
+ * Caller is responsible for ensuring that the output buffer is large enough. The required output
+ * buffer size can be obtained by calling `fixed_point_string_size`.
+ *
+ * @param value The value of the fixed point number
+ * @param scale The scale of the fixed point number
+ * @param out_ptr The pointer to the output string
+ */
+__device__ inline void fixed_point_to_string(__int128_t const& value, int32_t scale, char* out_ptr)
+{
+  if (scale >= 0) {
+    out_ptr += integer_to_string(value, out_ptr);
+    thrust::generate_n(thrust::seq, out_ptr, scale, []() { return '0'; });  // add zeros
+    return;
+  }
+
+  // scale < 0
+  // write format:   [-]integer.fraction
+  // where integer  = abs(value) / (10^abs(scale))
+  //       fraction = abs(value) % (10^abs(scale))
+  if (value < 0) *out_ptr++ = '-';  // add sign
+  auto const abs_value = numeric::detail::abs(value);
+  auto const exp_ten   = numeric::detail::exp10<__int128_t>(-scale);
+  auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten)));
+
+  out_ptr += integer_to_string(abs_value / exp_ten, out_ptr);  // add the integer part
+  *out_ptr++ = '.';                                            // add decimal point
+
+  thrust::generate_n(thrust::seq, out_ptr, num_zeros, []() { return '0'; });  // add zeros
+  out_ptr += num_zeros;
+
+  integer_to_string(abs_value % exp_ten, out_ptr);  // add the fraction part
+}
+
+}  // namespace cudf::strings::detail
diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index fc50b7118be..bc399b75ef9 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -178,7 +178,9 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen)
   auto op = std::tuple(field_reader(1, s.minimum),
                        field_reader(2, s.maximum),
                        field_reader(3, s.minimum_utc),
-                       field_reader(4, s.maximum_utc));
+                       field_reader(4, s.maximum_utc),
+                       field_reader(5, s.minimum_nanos),
+                       field_reader(6, s.maximum_nanos));
   function_builder(s, maxlen, op);
 }
 
diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu
index 069841980c1..69d7ec95acd 100644
--- a/cpp/src/io/orc/stats_enc.cu
+++ b/cpp/src/io/orc/stats_enc.cu
@@ -16,15 +16,16 @@
 
 #include "orc_gpu.hpp"
 
-#include <cudf/io/orc_types.hpp>
 #include <io/utilities/block_utils.cuh>
 
+#include <cudf/io/orc_types.hpp>
+#include <cudf/strings/detail/convert/fixed_point_to_string.cuh>
+
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf {
-namespace io {
-namespace orc {
-namespace gpu {
+namespace cudf::io::orc::gpu {
+
+using strings::detail::fixed_point_string_size;
 
 constexpr unsigned int init_threads_per_group = 32;
 constexpr unsigned int init_groups_per_block  = 4;
@@ -58,13 +59,14 @@ __global__ void __launch_bounds__(init_threads_per_block)
 constexpr unsigned int buffersize_reduction_dim = 32;
 constexpr unsigned int block_size        = buffersize_reduction_dim * buffersize_reduction_dim;
 constexpr unsigned int pb_fld_hdrlen     = 1;
-constexpr unsigned int pb_fld_hdrlen16   = 2;  // > 127-byte length
-constexpr unsigned int pb_fld_hdrlen32   = 5;  // > 16KB length
+constexpr unsigned int pb_fld_hdrlen32   = 5;
+constexpr unsigned int pb_fldlen_int32   = 5;
 constexpr unsigned int pb_fldlen_int64   = 10;
 constexpr unsigned int pb_fldlen_float64 = 8;
-constexpr unsigned int pb_fldlen_decimal = 40;  // Assume decimal2string fits in 40 characters
 constexpr unsigned int pb_fldlen_bucket1 = 1 + pb_fldlen_int64;
-constexpr unsigned int pb_fldlen_common  = 2 * pb_fld_hdrlen + pb_fldlen_int64;
+// statistics field number + number of values + has null
+constexpr unsigned int pb_fldlen_common =
+  pb_fld_hdrlen + (pb_fld_hdrlen + pb_fldlen_int64) + 2 * pb_fld_hdrlen;
 
 template <unsigned int block_size>
 __global__ void __launch_bounds__(block_size, 1)
@@ -87,21 +89,32 @@ __global__ void __launch_bounds__(block_size, 1)
         case dtype_int8:
         case dtype_int16:
         case dtype_int32:
-        case dtype_date32:
         case dtype_int64:
-        case dtype_timestamp64:
           stats_len = pb_fldlen_common + pb_fld_hdrlen + 3 * (pb_fld_hdrlen + pb_fldlen_int64);
           break;
+        case dtype_date32:
+          stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64);
+          break;
+        case dtype_timestamp64:
+          stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) +
+                      2 * (pb_fld_hdrlen + pb_fldlen_int32);
+          break;
         case dtype_float32:
         case dtype_float64:
           stats_len = pb_fldlen_common + pb_fld_hdrlen + 3 * (pb_fld_hdrlen + pb_fldlen_float64);
           break;
         case dtype_decimal64:
-        case dtype_decimal128:
-          stats_len = pb_fldlen_common + pb_fld_hdrlen16 + 3 * (pb_fld_hdrlen + pb_fldlen_decimal);
-          break;
+        case dtype_decimal128: {
+          auto const scale    = groups[idx].col_dtype.scale();
+          auto const min_size = fixed_point_string_size(chunks[idx].min_value.d128_val, scale);
+          auto const max_size = fixed_point_string_size(chunks[idx].max_value.d128_val, scale);
+          auto const sum_size = fixed_point_string_size(chunks[idx].sum.d128_val, scale);
+          // common + total field length + encoded string lengths + strings
+          stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen32) +
+                      min_size + max_size + sum_size;
+        } break;
         case dtype_string:
-          stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fldlen_int64) +
+          stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen32) +
                       chunks[idx].min_value.str_val.length + chunks[idx].max_value.str_val.length;
           break;
         case dtype_none: stats_len = pb_fldlen_common;
@@ -126,9 +139,6 @@ struct stats_state_s {
   statistics_chunk chunk;
   statistics_merge_group group;
   statistics_dtype stats_dtype;  //!< Statistics data type for this column
-  // ORC stats
-  uint64_t numberOfValues;
-  uint8_t hasNull;
 };
 
 /*
@@ -178,6 +188,15 @@ __device__ inline uint8_t* pb_put_binary(uint8_t* p, uint32_t id, void const* by
   return p + len;
 }
 
+__device__ inline uint8_t* pb_put_decimal(
+  uint8_t* p, uint32_t id, __int128_t value, int32_t scale, int32_t len)
+{
+  p[0] = id * 8 + ProtofType::FIXEDLEN;
+  p    = pb_encode_uint(p + 1, len);
+  strings::detail::fixed_point_to_string(value, scale, reinterpret_cast<char*>(p));
+  return p + len;
+}
+
 // Protobuf field encoding for 64-bit raw encoding (double)
 __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* raw64)
 {
@@ -186,6 +205,15 @@ __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* r
   return p + 9;
 }
 
+// Splits a nanosecond timestamp into milliseconds and nanoseconds
+__device__ std::pair<int64_t, int32_t> split_nanosecond_timestamp(int64_t nano_count)
+{
+  auto const ns           = cuda::std::chrono::nanoseconds(nano_count);
+  auto const ms_floor     = cuda::std::chrono::floor<cuda::std::chrono::milliseconds>(ns);
+  auto const ns_remainder = ns - ms_floor;
+  return {ms_floor.count(), ns_remainder.count()};
+}
+
 /**
  * @brief Encode statistics in ORC protobuf format
  *
@@ -228,12 +256,14 @@ __global__ void __launch_bounds__(encode_threads_per_block)
 
   // Encode and update actual bfr size
   if (idx < statistics_count && t == 0) {
-    s->chunk           = chunks[idx];
-    s->group           = groups[idx];
-    s->stats_dtype     = s->group.stats_dtype;
-    s->base            = blob_bfr + s->group.start_chunk;
-    s->end             = blob_bfr + s->group.start_chunk + s->group.num_chunks;
-    uint8_t* cur       = pb_put_uint(s->base, 1, s->chunk.non_nulls);
+    s->chunk       = chunks[idx];
+    s->group       = groups[idx];
+    s->stats_dtype = s->group.stats_dtype;
+    s->base        = blob_bfr + s->group.start_chunk;
+    s->end         = blob_bfr + s->group.start_chunk + s->group.num_chunks;
+    uint8_t* cur   = pb_put_uint(s->base, 1, s->chunk.non_nulls);
+    cur            = pb_put_uint(cur, 10, s->chunk.null_count != 0);  // hasNull (bool)
+
     uint8_t* fld_start = cur;
     switch (s->stats_dtype) {
       case dtype_int8:
@@ -265,11 +295,14 @@ __global__ void __launch_bounds__(encode_threads_per_block)
         //  optional double maximum = 2;
         //  optional double sum = 3;
         // }
-        if (s->chunk.has_minmax) {
+        if (s->chunk.has_minmax || s->chunk.has_sum) {
           *cur = 3 * 8 + ProtofType::FIXEDLEN;
           cur += 2;
-          cur          = pb_put_fixed64(cur, 1, &s->chunk.min_value.fp_val);
-          cur          = pb_put_fixed64(cur, 2, &s->chunk.max_value.fp_val);
+          if (s->chunk.has_minmax) {
+            cur = pb_put_fixed64(cur, 1, &s->chunk.min_value.fp_val);
+            cur = pb_put_fixed64(cur, 2, &s->chunk.max_value.fp_val);
+          }
+          if (s->chunk.has_sum) { cur = pb_put_fixed64(cur, 3, &s->chunk.sum.fp_val); }
           fld_start[1] = cur - (fld_start + 2);
         }
         break;
@@ -280,18 +313,25 @@ __global__ void __launch_bounds__(encode_threads_per_block)
         //  optional string maximum = 2;
         //  optional sint64 sum = 3; // sum will store the total length of all strings
         // }
-        if (s->chunk.has_minmax && s->chunk.has_sum) {
-          uint32_t sz = (pb_put_int(cur, 3, s->chunk.sum.i_val) - cur) +
-                        (pb_put_uint(cur, 1, s->chunk.min_value.str_val.length) - cur) +
-                        (pb_put_uint(cur, 2, s->chunk.max_value.str_val.length) - cur) +
-                        s->chunk.min_value.str_val.length + s->chunk.max_value.str_val.length;
+        if (s->chunk.has_minmax || s->chunk.has_sum) {
+          uint32_t sz = 0;
+          if (s->chunk.has_minmax) {
+            sz += (pb_put_uint(cur, 1, s->chunk.min_value.str_val.length) - cur) +
+                  (pb_put_uint(cur, 2, s->chunk.max_value.str_val.length) - cur) +
+                  s->chunk.min_value.str_val.length + s->chunk.max_value.str_val.length;
+          }
+          if (s->chunk.has_sum) { sz += pb_put_int(cur, 3, s->chunk.sum.i_val) - cur; }
+
           cur[0] = 4 * 8 + ProtofType::FIXEDLEN;
           cur    = pb_encode_uint(cur + 1, sz);
-          cur    = pb_put_binary(
-            cur, 1, s->chunk.min_value.str_val.ptr, s->chunk.min_value.str_val.length);
-          cur = pb_put_binary(
-            cur, 2, s->chunk.max_value.str_val.ptr, s->chunk.max_value.str_val.length);
-          cur = pb_put_int(cur, 3, s->chunk.sum.i_val);
+
+          if (s->chunk.has_minmax) {
+            cur = pb_put_binary(
+              cur, 1, s->chunk.min_value.str_val.ptr, s->chunk.min_value.str_val.length);
+            cur = pb_put_binary(
+              cur, 2, s->chunk.max_value.str_val.ptr, s->chunk.max_value.str_val.length);
+          }
+          if (s->chunk.has_sum) { cur = pb_put_int(cur, 3, s->chunk.sum.i_val); }
         }
         break;
       case dtype_bool:
@@ -299,8 +339,9 @@ __global__ void __launch_bounds__(encode_threads_per_block)
         // message BucketStatistics {
         //  repeated uint64 count = 1 [packed=true];
         // }
-        if (s->chunk.has_sum) {  // Sum is equal to the number of 'true' values
-          cur[0]       = 5 * 8 + ProtofType::FIXEDLEN;
+        if (s->chunk.has_sum) {
+          cur[0] = 5 * 8 + ProtofType::FIXEDLEN;
+          // count is equal to the number of 'true' values, despite what specs say
           cur          = pb_put_packed_uint(cur + 2, 1, s->chunk.sum.u_val);
           fld_start[1] = cur - (fld_start + 2);
         }
@@ -313,8 +354,33 @@ __global__ void __launch_bounds__(encode_threads_per_block)
         //  optional string maximum = 2;
         //  optional string sum = 3;
         // }
-        if (s->chunk.has_minmax) {
-          // TODO: Decimal support (decimal min/max stored as strings)
+        if (s->chunk.has_minmax or s->chunk.has_sum) {
+          auto const scale = s->group.col_dtype.scale();
+
+          uint32_t sz = 0;
+          auto const min_size =
+            s->chunk.has_minmax ? fixed_point_string_size(s->chunk.min_value.d128_val, scale) : 0;
+          auto const max_size =
+            s->chunk.has_minmax ? fixed_point_string_size(s->chunk.max_value.d128_val, scale) : 0;
+          if (s->chunk.has_minmax) {
+            // encoded string lengths, plus the strings
+            sz += (pb_put_uint(cur, 1, min_size) - cur) + min_size +
+                  (pb_put_uint(cur, 1, max_size) - cur) + max_size;
+          }
+          auto const sum_size =
+            s->chunk.has_sum ? fixed_point_string_size(s->chunk.sum.d128_val, scale) : 0;
+          if (s->chunk.has_sum) { sz += (pb_put_uint(cur, 1, sum_size) - cur) + sum_size; }
+
+          cur[0] = 6 * 8 + ProtofType::FIXEDLEN;
+          cur    = pb_encode_uint(cur + 1, sz);
+
+          if (s->chunk.has_minmax) {
+            cur = pb_put_decimal(cur, 1, s->chunk.min_value.d128_val, scale, min_size);  //  minimum
+            cur = pb_put_decimal(cur, 2, s->chunk.max_value.d128_val, scale, max_size);  // maximum
+          }
+          if (s->chunk.has_sum) {
+            cur = pb_put_decimal(cur, 3, s->chunk.sum.d128_val, scale, sum_size);  // sum
+          }
         }
         break;
       case dtype_date32:
@@ -338,12 +404,24 @@ __global__ void __launch_bounds__(encode_threads_per_block)
         //  optional sint64 maximum = 2;
         //  optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch
         //  optional sint64 maximumUtc = 4;
+        //  optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve
+        //  optional int32 maximumNanos = 6; // nanosecond precision
         // }
         if (s->chunk.has_minmax) {
           cur[0] = 9 * 8 + ProtofType::FIXEDLEN;
           cur += 2;
-          cur          = pb_put_int(cur, 3, s->chunk.min_value.i_val);  // minimumUtc
-          cur          = pb_put_int(cur, 4, s->chunk.max_value.i_val);  // maximumUtc
+          auto const [min_ms, min_ns_remainder] =
+            split_nanosecond_timestamp(s->chunk.min_value.i_val);
+          auto const [max_ms, max_ns_remainder] =
+            split_nanosecond_timestamp(s->chunk.max_value.i_val);
+
+          // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC
+          cur          = pb_put_int(cur, 1, min_ms);            // minimum
+          cur          = pb_put_int(cur, 2, max_ms);            // maximum
+          cur          = pb_put_int(cur, 3, min_ms);            // minimumUtc
+          cur          = pb_put_int(cur, 4, max_ms);            // maximumUtc
+          cur          = pb_put_int(cur, 5, min_ns_remainder);  // minimumNanos
+          cur          = pb_put_int(cur, 6, max_ns_remainder);  // maximumNanos
           fld_start[1] = cur - (fld_start + 2);
         }
         break;
@@ -403,7 +481,4 @@ void orc_encode_statistics(uint8_t* blob_bfr,
     blob_bfr, groups, chunks, statistics_count);
 }
 
-}  // namespace gpu
-}  // namespace orc
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::orc::gpu
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 0af561be8da..fe0dbb85124 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -1858,8 +1858,8 @@ __device__ std::pair<void const*, uint32_t> get_extremum(statistics_val const* s
     }
     case dtype_int64:
     case dtype_timestamp64:
-    case dtype_float64:
-    case dtype_decimal64: return {stats_val, sizeof(int64_t)};
+    case dtype_float64: return {stats_val, sizeof(int64_t)};
+    case dtype_decimal64:
     case dtype_decimal128:
       byte_reverse128(stats_val->d128_val, scratch);
       return {scratch, sizeof(__int128_t)};
diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh
index 32931d7d34d..ea8c71f0dcb 100644
--- a/cpp/src/io/statistics/statistics_type_identification.cuh
+++ b/cpp/src/io/statistics/statistics_type_identification.cuh
@@ -49,15 +49,15 @@ enum class is_int96_timestamp { YES, NO };
 template <io_file_format IO, is_int96_timestamp INT96>
 struct conversion_map;
 
-// Every timestamp or duration type is converted to milliseconds in ORC statistics
+// Every timestamp or duration type is converted to nanoseconds in ORC statistics
 template <is_int96_timestamp INT96>
 struct conversion_map<io_file_format::ORC, INT96> {
-  using types = std::tuple<std::pair<cudf::timestamp_s, cudf::timestamp_ms>,
-                           std::pair<cudf::timestamp_us, cudf::timestamp_ms>,
-                           std::pair<cudf::timestamp_ns, cudf::timestamp_ms>,
-                           std::pair<cudf::duration_s, cudf::duration_ms>,
-                           std::pair<cudf::duration_us, cudf::duration_ms>,
-                           std::pair<cudf::duration_ns, cudf::duration_ms>>;
+  using types = std::tuple<std::pair<cudf::timestamp_s, cudf::timestamp_ns>,
+                           std::pair<cudf::timestamp_us, cudf::timestamp_ns>,
+                           std::pair<cudf::timestamp_ns, cudf::timestamp_ns>,
+                           std::pair<cudf::duration_s, cudf::duration_ns>,
+                           std::pair<cudf::duration_us, cudf::duration_ns>,
+                           std::pair<cudf::duration_ns, cudf::duration_ns>>;
 };
 
 // In Parquet timestamps and durations with second resolution are converted to
@@ -125,7 +125,7 @@ class extrema_type {
 
   using non_arithmetic_extrema_type = typename std::conditional_t<
     cudf::is_fixed_point<T>() or cudf::is_duration<T>() or cudf::is_timestamp<T>(),
-    typename std::conditional_t<std::is_same_v<T, numeric::decimal128>, __int128_t, int64_t>,
+    typename std::conditional_t<cudf::is_fixed_point<T>(), __int128_t, int64_t>,
     typename std::conditional_t<
       std::is_same_v<T, string_view>,
       string_view,
@@ -134,8 +134,7 @@ class extrema_type {
   // unsigned int/bool -> uint64_t
   // signed int        -> int64_t
   // float/double      -> double
-  // decimal32/64      -> int64_t
-  // decimal128        -> __int128_t
+  // decimal32/64/128  -> __int128_t
   // duration_[T]      -> int64_t
   // string_view       -> string_view
   // byte_array_view   -> byte_array_view
diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh
index d007209a12a..e6ec1471cb7 100644
--- a/cpp/src/io/statistics/typed_statistics_chunk.cuh
+++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh
@@ -244,9 +244,9 @@ get_untyped_chunk(typed_statistics_chunk<T, include_aggregate> const& chunk)
   stat.null_count = chunk.null_count;
   stat.has_minmax = chunk.has_minmax;
   stat.has_sum    = [&]() {
-    if (!chunk.has_minmax) return false;
     // invalidate the sum if overflow or underflow is possible
     if constexpr (std::is_floating_point_v<E> or std::is_integral_v<E>) {
+      if (!chunk.has_minmax) { return true; }
       return std::numeric_limits<E>::max() / chunk.non_nulls >=
                static_cast<E>(chunk.maximum_value) and
              std::numeric_limits<E>::lowest() / chunk.non_nulls <=
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index a3336258d3e..51aab9faeba 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -21,7 +21,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_fixed_point.hpp>
 #include <cudf/strings/detail/convert/fixed_point.cuh>
-#include <cudf/strings/detail/convert/int_to_string.cuh>
+#include <cudf/strings/detail/convert/fixed_point_to_string.cuh>
 #include <cudf/strings/detail/converters.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -200,62 +200,19 @@ struct from_fixed_point_fn {
   size_type* d_offsets{};
   char* d_chars{};
 
-  /**
-   * @brief Calculates the size of the string required to convert the element, in base-10 format.
-   *
-   * Output format is [-]integer.fraction
-   */
-  __device__ int32_t compute_output_size(DecimalType value)
-  {
-    auto const scale = d_decimals.type().scale();
-
-    if (scale >= 0) return count_digits(value) + scale;
-
-    auto const abs_value = numeric::detail::abs(value);
-    auto const exp_ten   = numeric::detail::exp10<DecimalType>(-scale);
-    auto const fraction  = count_digits(abs_value % exp_ten);
-    auto const num_zeros = std::max(0, (-scale - fraction));
-    return static_cast<int32_t>(value < 0) +    // sign if negative
-           count_digits(abs_value / exp_ten) +  // integer
-           1 +                                  // decimal point
-           num_zeros +                          // zeros padding
-           fraction;                            // size of fraction
-  }
-
   /**
    * @brief Converts a decimal element into a string.
    *
    * The value is converted into base-10 digits [0-9]
    * plus the decimal point and a negative sign prefix.
    */
-  __device__ void decimal_to_string(size_type idx)
+  __device__ void fixed_point_element_to_string(size_type idx)
   {
     auto const value = d_decimals.element<DecimalType>(idx);
     auto const scale = d_decimals.type().scale();
     char* d_buffer   = d_chars + d_offsets[idx];
 
-    if (scale >= 0) {
-      d_buffer += integer_to_string(value, d_buffer);
-      thrust::generate_n(thrust::seq, d_buffer, scale, []() { return '0'; });  // add zeros
-      return;
-    }
-
-    // scale < 0
-    // write format:   [-]integer.fraction
-    // where integer  = abs(value) / (10^abs(scale))
-    //       fraction = abs(value) % (10^abs(scale))
-    if (value < 0) *d_buffer++ = '-';  // add sign
-    auto const abs_value = numeric::detail::abs(value);
-    auto const exp_ten   = numeric::detail::exp10<DecimalType>(-scale);
-    auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten)));
-
-    d_buffer += integer_to_string(abs_value / exp_ten, d_buffer);  // add the integer part
-    *d_buffer++ = '.';                                             // add decimal point
-
-    thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; });  // add zeros
-    d_buffer += num_zeros;
-
-    integer_to_string(abs_value % exp_ten, d_buffer);  // add the fraction part
+    fixed_point_to_string(value, scale, d_buffer);
   }
 
   __device__ void operator()(size_type idx)
@@ -265,9 +222,10 @@ struct from_fixed_point_fn {
       return;
     }
     if (d_chars != nullptr) {
-      decimal_to_string(idx);
+      fixed_point_element_to_string(idx);
     } else {
-      d_offsets[idx] = compute_output_size(d_decimals.element<DecimalType>(idx));
+      d_offsets[idx] =
+        fixed_point_string_size(d_decimals.element<DecimalType>(idx), d_decimals.type().scale());
     }
   }
 };
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index cff7b1cf081..890ef914713 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -976,6 +976,10 @@ TEST_F(OrcReaderTest, CombinedSkipRowTest)
 TEST_F(OrcStatisticsTest, Basic)
 {
   auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
+  auto ts_sequence =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i - 4) * 1000002; });
+  auto dec_sequence =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i * 1001; });
   auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
 
   std::vector<char const*> strings{
@@ -986,11 +990,17 @@ TEST_F(OrcStatisticsTest, Basic)
     sequence, sequence + num_rows, validity);
   column_wrapper<float, typename decltype(sequence)::value_type> col2(
     sequence, sequence + num_rows, validity);
-  column_wrapper<cudf::string_view> col3{strings.begin(), strings.end()};
-  column_wrapper<bool, typename decltype(sequence)::value_type> col4(sequence, sequence + num_rows);
-  column_wrapper<cudf::timestamp_s, typename decltype(sequence)::value_type> col5(
-    sequence, sequence + num_rows, validity);
-  table_view expected({col1, col2, col3, col4, col5});
+  str_col col3{strings.begin(), strings.end()};
+  column_wrapper<cudf::timestamp_ns, typename decltype(sequence)::value_type> col4(
+    ts_sequence, ts_sequence + num_rows, validity);
+  column_wrapper<cudf::timestamp_us, typename decltype(sequence)::value_type> col5(
+    ts_sequence, ts_sequence + num_rows, validity);
+  bool_col col6({true, true, true, true, true, false, false, false, false}, validity);
+
+  cudf::test::fixed_point_column_wrapper<int64_t> col7(
+    dec_sequence, dec_sequence + num_rows, numeric::scale_type{-1});
+
+  table_view expected({col1, col2, col3, col4, col5, col6, col7});
 
   auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc");
 
@@ -1000,16 +1010,21 @@ TEST_F(OrcStatisticsTest, Basic)
 
   auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath});
 
-  auto const expected_column_names =
-    std::vector<std::string>{"", "_col0", "_col1", "_col2", "_col3", "_col4"};
+  auto expected_column_names = std::vector<std::string>{""};
+  std::generate_n(
+    std::back_inserter(expected_column_names),
+    expected.num_columns(),
+    [starting_index = 0]() mutable { return "_col" + std::to_string(starting_index++); });
   EXPECT_EQ(stats.column_names, expected_column_names);
 
   auto validate_statistics = [&](std::vector<cudf::io::column_statistics> const& stats) {
+    ASSERT_EQ(stats.size(), expected.num_columns() + 1);
     auto& s0 = stats[0];
     EXPECT_EQ(*s0.number_of_values, 9ul);
 
     auto& s1 = stats[1];
     EXPECT_EQ(*s1.number_of_values, 4ul);
+    EXPECT_TRUE(*s1.has_null);
     auto& ts1 = std::get<cudf::io::integer_statistics>(s1.type_specific_stats);
     EXPECT_EQ(*ts1.minimum, 1);
     EXPECT_EQ(*ts1.maximum, 7);
@@ -1017,30 +1032,55 @@ TEST_F(OrcStatisticsTest, Basic)
 
     auto& s2 = stats[2];
     EXPECT_EQ(*s2.number_of_values, 4ul);
+    EXPECT_TRUE(*s2.has_null);
     auto& ts2 = std::get<cudf::io::double_statistics>(s2.type_specific_stats);
     EXPECT_EQ(*ts2.minimum, 1.);
     EXPECT_EQ(*ts2.maximum, 7.);
-    // No sum ATM, filed #7087
-    ASSERT_FALSE(ts2.sum);
+    EXPECT_EQ(*ts2.sum, 16.);
 
     auto& s3 = stats[3];
     EXPECT_EQ(*s3.number_of_values, 9ul);
+    EXPECT_FALSE(*s3.has_null);
     auto& ts3 = std::get<cudf::io::string_statistics>(s3.type_specific_stats);
     EXPECT_EQ(*ts3.minimum, "Friday");
     EXPECT_EQ(*ts3.maximum, "Wednesday");
     EXPECT_EQ(*ts3.sum, 58ul);
 
     auto& s4 = stats[4];
-    EXPECT_EQ(*s4.number_of_values, 9ul);
-    EXPECT_EQ(std::get<cudf::io::bucket_statistics>(s4.type_specific_stats).count[0], 8ul);
+    EXPECT_EQ(*s4.number_of_values, 4ul);
+    EXPECT_TRUE(*s4.has_null);
+    auto& ts4 = std::get<cudf::io::timestamp_statistics>(s4.type_specific_stats);
+    EXPECT_EQ(*ts4.minimum, -4);
+    EXPECT_EQ(*ts4.maximum, 3);
+    EXPECT_EQ(*ts4.minimum_utc, -4);
+    EXPECT_EQ(*ts4.maximum_utc, 3);
+    EXPECT_EQ(*ts4.minimum_nanos, 999994);
+    EXPECT_EQ(*ts4.maximum_nanos, 6);
 
     auto& s5 = stats[5];
     EXPECT_EQ(*s5.number_of_values, 4ul);
+    EXPECT_TRUE(*s5.has_null);
     auto& ts5 = std::get<cudf::io::timestamp_statistics>(s5.type_specific_stats);
-    EXPECT_EQ(*ts5.minimum_utc, 1000);
-    EXPECT_EQ(*ts5.maximum_utc, 7000);
-    ASSERT_FALSE(ts5.minimum);
-    ASSERT_FALSE(ts5.maximum);
+    EXPECT_EQ(*ts5.minimum, -3001);
+    EXPECT_EQ(*ts5.maximum, 3000);
+    EXPECT_EQ(*ts5.minimum_utc, -3001);
+    EXPECT_EQ(*ts5.maximum_utc, 3000);
+    EXPECT_EQ(*ts5.minimum_nanos, 994000);
+    EXPECT_EQ(*ts5.maximum_nanos, 6000);
+
+    auto& s6 = stats[6];
+    EXPECT_EQ(*s6.number_of_values, 4ul);
+    EXPECT_TRUE(*s6.has_null);
+    auto& ts6 = std::get<cudf::io::bucket_statistics>(s6.type_specific_stats);
+    EXPECT_EQ(ts6.count[0], 2);
+
+    auto& s7 = stats[7];
+    EXPECT_EQ(*s7.number_of_values, 9ul);
+    EXPECT_FALSE(*s7.has_null);
+    auto& ts7 = std::get<cudf::io::decimal_statistics>(s7.type_specific_stats);
+    EXPECT_EQ(*ts7.minimum, "0.0");
+    EXPECT_EQ(*ts7.maximum, "800.8");
+    EXPECT_EQ(*ts7.sum, "3603.6");
   };
 
   validate_statistics(stats.file_stats);
@@ -1259,9 +1299,8 @@ TEST_F(OrcStatisticsTest, Overflow)
 
 TEST_F(OrcStatisticsTest, HasNull)
 {
-  // cudf's ORC writer doesn't yet support the ability to encode the hasNull value in statistics so
-  // we're embedding a file created using pyorc
-  //
+  // This test can now be implemented with libcudf; keeping the pyorc version to keep the test
+  // inputs diversified
   // Method to create file:
   // >>> import pyorc
   // >>> output = open("./temp.orc", "wb")
@@ -1861,4 +1900,38 @@ TEST_F(OrcWriterTest, EmptyChildStringColumn)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
 }
 
+// Checks stats of an all-null column: zero values, has_null set, no min/max, and a zero sum
+template <typename T>
+void check_all_null_stats(cudf::io::column_statistics const& stats)
+{
+  EXPECT_EQ(stats.number_of_values, 0);
+  EXPECT_EQ(stats.has_null, true);  // compare the value; EXPECT_TRUE only tests engagement
+  auto const& ts = std::get<T>(stats.type_specific_stats);
+  EXPECT_FALSE(ts.minimum.has_value());
+  EXPECT_FALSE(ts.maximum.has_value());
+  ASSERT_TRUE(ts.sum.has_value());  // abort before dereferencing an empty optional
+  EXPECT_EQ(*ts.sum, 0);
+}
+
+TEST_F(OrcStatisticsTest, AllNulls)
+{
+  float64_col double_col({0., 0., 0.}, cudf::test::iterators::all_nulls());
+  int32_col int_col({0, 0, 0}, cudf::test::iterators::all_nulls());
+  str_col string_col({"", "", ""}, cudf::test::iterators::all_nulls());
+  // Column order here (int, double, string) is what the file_stats indices below rely on
+  cudf::table_view expected({int_col, double_col, string_col});
+
+  std::vector<char> out_buffer;
+  cudf::io::orc_writer_options out_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected);
+  cudf::io::write_orc(out_opts);
+  // Parse the statistics back from the in-memory buffer that was just written
+  auto const stats = cudf::io::read_parsed_orc_statistics(
+    cudf::io::source_info{out_buffer.data(), out_buffer.size()});
+  // file_stats[0] is not checked; entries 1..3 are matched to the three columns in table order
+  check_all_null_stats<cudf::io::integer_statistics>(stats.file_stats[1]);
+  check_all_null_stats<cudf::io::double_statistics>(stats.file_stats[2]);
+  check_all_null_stats<cudf::io::string_statistics>(stats.file_stats[3]);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index aafc8831bf4..07aa5430f4f 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -633,16 +633,19 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq):
     for col in gdf:
         if "minimum" in file_stats[0][col]:
             stats_min = file_stats[0][col]["minimum"]
-            actual_min = gdf[col].min()
-            assert normalized_equals(actual_min, stats_min)
+            if stats_min is not None:
+                actual_min = gdf[col].min()
+                assert normalized_equals(actual_min, stats_min)
         if "maximum" in file_stats[0][col]:
             stats_max = file_stats[0][col]["maximum"]
-            actual_max = gdf[col].max()
-            assert normalized_equals(actual_max, stats_max)
+            if stats_max is not None:
+                actual_max = gdf[col].max()
+                assert normalized_equals(actual_max, stats_max)
         if "number_of_values" in file_stats[0][col]:
             stats_num_vals = file_stats[0][col]["number_of_values"]
-            actual_num_vals = gdf[col].count()
-            assert stats_num_vals == actual_num_vals
+            if stats_num_vals is not None:
+                actual_num_vals = gdf[col].count()
+                assert stats_num_vals == actual_num_vals
 
     # compare stripe statistics with actual min/max
     for stripe_idx in range(0, orc_file.nstripes):
@@ -651,21 +654,24 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq):
         stripe_df = cudf.DataFrame(stripe.to_pandas())
         for col in stripe_df:
             if "minimum" in stripes_stats[stripe_idx][col]:
-                actual_min = stripe_df[col].min()
                 stats_min = stripes_stats[stripe_idx][col]["minimum"]
-                assert normalized_equals(actual_min, stats_min)
+                if stats_min is not None:
+                    actual_min = stripe_df[col].min()
+                    assert normalized_equals(actual_min, stats_min)
 
             if "maximum" in stripes_stats[stripe_idx][col]:
-                actual_max = stripe_df[col].max()
                 stats_max = stripes_stats[stripe_idx][col]["maximum"]
-                assert normalized_equals(actual_max, stats_max)
+                if stats_max is not None:
+                    actual_max = stripe_df[col].max()
+                    assert normalized_equals(actual_max, stats_max)
 
             if "number_of_values" in stripes_stats[stripe_idx][col]:
                 stats_num_vals = stripes_stats[stripe_idx][col][
                     "number_of_values"
                 ]
-                actual_num_vals = stripe_df[col].count()
-                assert stats_num_vals == actual_num_vals
+                if stats_num_vals is not None:
+                    actual_num_vals = stripe_df[col].count()
+                    assert stats_num_vals == actual_num_vals
 
 
 @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"])
@@ -733,16 +739,19 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq):
     for col in expect:
         if "minimum" in file_stats[0][col]:
             stats_min = file_stats[0][col]["minimum"]
-            actual_min = expect[col].min()
-            assert normalized_equals(actual_min, stats_min)
+            if stats_min is not None:
+                actual_min = expect[col].min()
+                assert normalized_equals(actual_min, stats_min)
         if "maximum" in file_stats[0][col]:
             stats_max = file_stats[0][col]["maximum"]
-            actual_max = expect[col].max()
-            assert normalized_equals(actual_max, stats_max)
+            if stats_max is not None:
+                actual_max = expect[col].max()
+                assert normalized_equals(actual_max, stats_max)
         if "number_of_values" in file_stats[0][col]:
             stats_num_vals = file_stats[0][col]["number_of_values"]
-            actual_num_vals = expect[col].count()
-            assert stats_num_vals == actual_num_vals
+            if stats_num_vals is not None:
+                actual_num_vals = expect[col].count()
+                assert stats_num_vals == actual_num_vals
 
     # compare stripe statistics with actual min/max
     for stripe_idx in range(0, orc_file.nstripes):
@@ -751,21 +760,24 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq):
         stripe_df = cudf.DataFrame(stripe.to_pandas())
         for col in stripe_df:
             if "minimum" in stripes_stats[stripe_idx][col]:
-                actual_min = stripe_df[col].min()
                 stats_min = stripes_stats[stripe_idx][col]["minimum"]
-                assert normalized_equals(actual_min, stats_min)
+                if stats_min is not None:
+                    actual_min = stripe_df[col].min()
+                    assert normalized_equals(actual_min, stats_min)
 
             if "maximum" in stripes_stats[stripe_idx][col]:
-                actual_max = stripe_df[col].max()
                 stats_max = stripes_stats[stripe_idx][col]["maximum"]
-                assert normalized_equals(actual_max, stats_max)
+                if stats_max is not None:
+                    actual_max = stripe_df[col].max()
+                    assert normalized_equals(actual_max, stats_max)
 
             if "number_of_values" in stripes_stats[stripe_idx][col]:
                 stats_num_vals = stripes_stats[stripe_idx][col][
                     "number_of_values"
                 ]
-                actual_num_vals = stripe_df[col].count()
-                assert stats_num_vals == actual_num_vals
+                if stats_num_vals is not None:
+                    actual_num_vals = stripe_df[col].count()
+                    assert stats_num_vals == actual_num_vals
 
 
 @pytest.mark.parametrize("nrows", [1, 100, 6000000])

From bdc1f3a6e1f383cd689ba8e92903b89e49cdb8d8 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 18 Sep 2023 19:34:29 -0400
Subject: [PATCH 12/23] Expose streams in public strings case APIs (#14056)

Add stream parameter to public strings APIs:
- `cudf::strings::capitalize()`
- `cudf::strings::title()`
- `cudf::strings::is_title()`
- `cudf::strings::to_lower()`
- `cudf::strings::to_upper()`
- `cudf::strings::swapcase()`

Reference #13744

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14056
---
 cpp/include/cudf/strings/capitalize.hpp | 28 ++++++++-----
 cpp/include/cudf/strings/case.hpp       |  8 +++-
 cpp/src/strings/capitalize.cu           |  9 ++--
 cpp/src/strings/case.cu                 |  9 ++--
 cpp/tests/CMakeLists.txt                |  1 +
 cpp/tests/streams/strings/case_test.cpp | 55 +++++++++++++++++++++++++
 6 files changed, 92 insertions(+), 18 deletions(-)
 create mode 100644 cpp/tests/streams/strings/case_test.cpp

diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp
index 6d01ab047ba..57375e9ac6a 100644
--- a/cpp/include/cudf/strings/capitalize.hpp
+++ b/cpp/include/cudf/strings/capitalize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,16 +50,18 @@ namespace strings {
  *
  * Any null string entries return corresponding null output column entries.
  *
- * @throw cudf::logic_error if `delimiter.is_valid()` is  `false`.
+ * @throw cudf::logic_error if `delimiter.is_valid()` is `false`.
  *
- * @param input String column.
- * @param delimiters Characters for identifying words to capitalize.
+ * @param input String column
+ * @param delimiters Characters for identifying words to capitalize
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return Column of strings capitalized from the input column.
+ * @return Column of strings capitalized from the input column
  */
 std::unique_ptr<column> capitalize(
   strings_column_view const& input,
-  string_scalar const& delimiters     = string_scalar(""),
+  string_scalar const& delimiters     = string_scalar("", true, cudf::get_default_stream()),
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -83,14 +85,16 @@ std::unique_ptr<column> capitalize(
  *
  * Any null string entries return corresponding null output column entries.
  *
- * @param input String column.
- * @param sequence_type The character type that is used when identifying words.
+ * @param input String column
+ * @param sequence_type The character type that is used when identifying words
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return Column of titled strings.
+ * @return Column of titled strings
  */
 std::unique_ptr<column> title(
   strings_column_view const& input,
   string_character_types sequence_type = string_character_types::ALPHA,
+  rmm::cuda_stream_view stream         = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
 
 /**
@@ -112,12 +116,14 @@ std::unique_ptr<column> title(
  *
  * Any null string entries result in corresponding null output column entries.
  *
- * @param input String column.
+ * @param input String column
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return Column of type BOOL8.
+ * @return Column of type BOOL8
  */
 std::unique_ptr<column> is_title(
   strings_column_view const& input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
diff --git a/cpp/include/cudf/strings/case.hpp b/cpp/include/cudf/strings/case.hpp
index 06ba4f8d882..94191686a92 100644
--- a/cpp/include/cudf/strings/case.hpp
+++ b/cpp/include/cudf/strings/case.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,11 +38,13 @@ namespace strings {
  * Any null entries create null entries in the output column.
  *
  * @param strings Strings instance for this operation.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New column of strings with characters converted.
  */
 std::unique_ptr<column> to_lower(
   strings_column_view const& strings,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -55,11 +57,13 @@ std::unique_ptr<column> to_lower(
  * Any null entries create null entries in the output column.
  *
  * @param strings Strings instance for this operation.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New column of strings with characters converted.
  */
 std::unique_ptr<column> to_upper(
   strings_column_view const& strings,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -73,11 +77,13 @@ std::unique_ptr<column> to_upper(
  * Any null entries create null entries in the output column.
  *
  * @param strings Strings instance for this operation.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New column of strings with characters converted.
  */
 std::unique_ptr<column> swapcase(
   strings_column_view const& strings,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu
index 4e248922702..c555031b588 100644
--- a/cpp/src/strings/capitalize.cu
+++ b/cpp/src/strings/capitalize.cu
@@ -287,25 +287,28 @@ std::unique_ptr<column> is_title(strings_column_view const& input,
 
 std::unique_ptr<column> capitalize(strings_column_view const& input,
                                    string_scalar const& delimiter,
+                                   rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::capitalize(input, delimiter, cudf::get_default_stream(), mr);
+  return detail::capitalize(input, delimiter, stream, mr);
 }
 
 std::unique_ptr<column> title(strings_column_view const& input,
                               string_character_types sequence_type,
+                              rmm::cuda_stream_view stream,
                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::title(input, sequence_type, cudf::get_default_stream(), mr);
+  return detail::title(input, sequence_type, stream, mr);
 }
 
 std::unique_ptr<column> is_title(strings_column_view const& input,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_title(input, cudf::get_default_stream(), mr);
+  return detail::is_title(input, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index c5fe7a19f53..8f4c2ee574a 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -310,24 +310,27 @@ std::unique_ptr<column> swapcase(strings_column_view const& strings,
 // APIs
 
 std::unique_ptr<column> to_lower(strings_column_view const& strings,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::to_lower(strings, cudf::get_default_stream(), mr);
+  return detail::to_lower(strings, stream, mr);
 }
 
 std::unique_ptr<column> to_upper(strings_column_view const& strings,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::to_upper(strings, cudf::get_default_stream(), mr);
+  return detail::to_upper(strings, stream, mr);
 }
 
 std::unique_ptr<column> swapcase(strings_column_view const& strings,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::swapcase(strings, cudf::get_default_stream(), mr);
+  return detail::swapcase(strings, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index a69dc9bf2f8..4923ef5c903 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -627,6 +627,7 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_STRINGS_TEST streams/strings/case_test.cpp STREAM_MODE testing)
 
 # ##################################################################################################
 # Install tests ####################################################################################
diff --git a/cpp/tests/streams/strings/case_test.cpp b/cpp/tests/streams/strings/case_test.cpp
new file mode 100644
index 00000000000..df3eabd773a
--- /dev/null
+++ b/cpp/tests/streams/strings/case_test.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/strings/capitalize.hpp>
+#include <cudf/strings/case.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+class StringsCaseTest : public cudf::test::BaseFixture {};
+
+TEST_F(StringsCaseTest, LowerUpper)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"",
+                                        "The quick brown fox",
+                                        "jumps over the lazy dog.",
+                                        "all work and no play makes Jack a dull boy",
+                                        R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"});
+  auto view = cudf::strings_column_view(input);
+
+  cudf::strings::to_lower(view, cudf::test::get_default_stream());
+  cudf::strings::to_upper(view, cudf::test::get_default_stream());
+  cudf::strings::swapcase(view, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsCaseTest, Capitalize)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"",
+                                        "The Quick Brown Fox",
+                                        "jumps over the lazy dog",
+                                        "all work and no play makes Jack a dull boy"});
+  auto view = cudf::strings_column_view(input);
+
+  auto const delimiter = cudf::string_scalar(" ", true, cudf::test::get_default_stream());
+  cudf::strings::capitalize(view, delimiter, cudf::test::get_default_stream());
+  cudf::strings::is_title(view, cudf::test::get_default_stream());
+  cudf::strings::title(
+    view, cudf::strings::string_character_types::ALPHA, cudf::test::get_default_stream());
+}

From c016b58b24e63468e9110a6ca82adfc5fd61202d Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 19 Sep 2023 07:50:20 -0500
Subject: [PATCH 13/23] Update to clang 16.0.6. (#14120)

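This PR updates cudf to use clang 16.0.6 tooling: the clang-format pre-commit hook is bumped to v16.0.6, the version expected by `cpp/scripts/run-clang-tidy.py` is updated to match, and the C++ sources are reformatted accordingly. The previous version, 16.0.1, has some minor formatting issues affecting several RAPIDS repos.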

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/14120
---
 .pre-commit-config.yaml                       |   2 +-
 cpp/benchmarks/iterator/iterator.cu           |   2 +-
 .../stream_compaction/apply_boolean_mask.cpp  |   4 +-
 cpp/benchmarks/string/char_types.cpp          |   2 +-
 cpp/benchmarks/string/extract.cpp             |   2 +-
 .../cudf/column/column_device_view.cuh        |   2 +-
 cpp/include/cudf/detail/copy_if.cuh           |   2 +-
 cpp/include/cudf/detail/indexalator.cuh       |   4 +-
 cpp/include/cudf/detail/join.hpp              |   4 +-
 cpp/include/cudf/fixed_point/fixed_point.hpp  |   2 +-
 cpp/include/cudf/groupby.hpp                  |   4 +-
 cpp/include/cudf/io/csv.hpp                   |   2 +-
 cpp/include/cudf/io/json.hpp                  |   2 +-
 cpp/include/cudf/strings/detail/utf8.hpp      |  36 ++--
 cpp/include/cudf/table/row_operators.cuh      |   4 +-
 cpp/include/cudf/table/table_view.hpp         |   2 +-
 cpp/include/cudf/wrappers/dictionary.hpp      |   2 +-
 cpp/include/cudf_test/base_fixture.hpp        |   4 +-
 cpp/include/nvtext/subword_tokenize.hpp       |   2 +-
 cpp/scripts/run-clang-tidy.py                 |   2 +-
 cpp/src/copying/contiguous_split.cu           |   8 +-
 cpp/src/groupby/sort/functors.hpp             |  10 +-
 cpp/src/io/avro/avro_gpu.cu                   |   2 +-
 cpp/src/io/comp/cpu_unbz2.cpp                 |   2 +-
 cpp/src/io/comp/debrotli.cu                   |   4 +-
 cpp/src/io/comp/gpuinflate.cu                 |  18 +-
 cpp/src/io/comp/uncomp.cpp                    |  10 +-
 cpp/src/io/comp/unsnap.cu                     |   2 +-
 cpp/src/io/json/json_column.cu                |   2 +-
 cpp/src/io/json/nested_json_gpu.cu            | 160 +++++++++---------
 cpp/src/io/orc/orc_gpu.hpp                    |   2 +-
 cpp/src/io/orc/stripe_data.cu                 |   4 +-
 .../io/parquet/compact_protocol_reader.cpp    |   2 +-
 .../io/parquet/compact_protocol_writer.cpp    |   2 +-
 cpp/src/io/parquet/delta_binary.cuh           |  20 +--
 cpp/src/io/parquet/page_delta_decode.cu       |   2 +-
 cpp/src/io/parquet/parquet.hpp                |   4 +-
 cpp/src/io/parquet/parquet_gpu.hpp            |  22 +--
 cpp/src/io/parquet/reader_impl_preprocess.cu  |   2 +-
 cpp/src/join/join.cu                          |   4 +-
 .../quantiles/tdigest/tdigest_aggregation.cu  |   2 +-
 .../rolling/detail/rolling_collect_list.cuh   |   2 +-
 cpp/src/strings/char_types/char_types.cu      |   4 +-
 cpp/src/strings/convert/convert_datetime.cu   |   6 +-
 cpp/src/strings/convert/convert_durations.cu  |   2 +-
 cpp/src/strings/convert/convert_floats.cu     |   6 +-
 cpp/src/strings/convert/convert_integers.cu   |   2 +-
 cpp/src/strings/convert/convert_ipv4.cu       |   2 +-
 cpp/src/strings/convert/convert_urls.cu       |   4 +-
 cpp/src/strings/json/json_path.cu             |   2 +-
 cpp/src/strings/regex/regcomp.cpp             |  14 +-
 cpp/src/strings/regex/regcomp.h               |   8 +-
 cpp/src/strings/regex/regex.cuh               |  18 +-
 cpp/src/strings/regex/regex.inl               |  10 +-
 cpp/src/strings/replace/replace_re.cu         |   2 +-
 cpp/src/strings/split/partition.cu            |   2 +-
 cpp/src/strings/split/split.cuh               |   2 +-
 cpp/src/strings/split/split_re.cu             |   2 +-
 cpp/src/strings/utilities.cu                  |   6 +-
 cpp/src/text/normalize.cu                     |   4 +-
 cpp/src/text/replace.cu                       |   2 +-
 cpp/src/text/subword/bpe_tokenizer.cu         |   2 +-
 cpp/src/text/subword/load_merges_file.cu      |   2 +-
 cpp/src/text/utilities/tokenize_ops.cuh       |   2 +-
 cpp/tests/groupby/merge_lists_tests.cpp       |   2 +-
 cpp/tests/groupby/merge_sets_tests.cpp        |  12 +-
 cpp/tests/io/parquet_test.cpp                 |   6 +-
 cpp/tests/lists/reverse_tests.cpp             |   8 +-
 .../difference_distinct_tests.cpp             |   2 +-
 .../intersect_distinct_tests.cpp              |   4 +-
 .../set_operations/union_distinct_tests.cpp   |   4 +-
 .../stream_compaction/distinct_tests.cpp      |  10 +-
 .../reshape/interleave_columns_tests.cpp      |   2 +-
 .../rolling/range_rolling_window_test.cpp     |   2 +-
 cpp/tests/sort/segmented_sort_tests.cpp       |   2 +-
 cpp/tests/strings/chars_types_tests.cpp       |  12 +-
 cpp/tests/strings/durations_tests.cpp         |   8 +-
 cpp/tests/utilities/column_utilities.cu       |   2 +-
 78 files changed, 276 insertions(+), 276 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 238e5b44030..7e44091774f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -63,7 +63,7 @@ repos:
                 # Explicitly specify the pyproject.toml at the repo root, not per-project.
                 args: ["--config=pyproject.toml"]
       - repo: https://github.com/pre-commit/mirrors-clang-format
-        rev: v16.0.1
+        rev: v16.0.6
         hooks:
               - id: clang-format
                 types_or: [c, c++, cuda]
diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu
index 7acf24c30a5..dcd13cf62c4 100644
--- a/cpp/benchmarks/iterator/iterator.cu
+++ b/cpp/benchmarks/iterator/iterator.cu
@@ -145,7 +145,7 @@ void BM_iterator(benchmark::State& state)
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     if (cub_or_thrust) {
       if (raw_or_iterator) {
-        raw_stream_bench_cub<T>(hasnull_F, dev_result);       // driven by raw pointer
+        raw_stream_bench_cub<T>(hasnull_F, dev_result);  // driven by raw pointer
       } else {
         iterator_bench_cub<T, false>(hasnull_F, dev_result);  // driven by riterator without nulls
       }
diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp
index a6feaf04842..f78aa9fa654 100644
--- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp
+++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp
@@ -59,8 +59,8 @@ void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns)
   int64_t const column_bytes_in    = column_bytes_out;  // we only read unmasked inputs
 
   int64_t const bytes_read =
-    (column_bytes_in + validity_bytes_in) * num_columns +   // reading columns
-    mask_size;                                              // reading boolean mask
+    (column_bytes_in + validity_bytes_in) * num_columns +  // reading columns
+    mask_size;                                             // reading boolean mask
   int64_t const bytes_written =
     (column_bytes_out + validity_bytes_out) * num_columns;  // writing columns
 
diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp
index 8e9e595fcef..59e6245fd41 100644
--- a/cpp/benchmarks/string/char_types.cpp
+++ b/cpp/benchmarks/string/char_types.cpp
@@ -43,7 +43,7 @@ static void bench_char_types(nvbench::state& state)
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   // gather some throughput statistics as well
   auto chars_size = input.chars_size();
-  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read;
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read;
   if (api_type == "all") {
     state.add_global_memory_writes<nvbench::int8_t>(num_rows);  // output is a bool8 per row
   } else {
diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp
index 9e67c5a5b52..135dadabbe4 100644
--- a/cpp/benchmarks/string/extract.cpp
+++ b/cpp/benchmarks/string/extract.cpp
@@ -43,7 +43,7 @@ static void bench_extract(nvbench::state& state)
   std::uniform_int_distribution<int> words_dist(0, 999);
   std::vector<std::string> samples(100);  // 100 unique rows of data to reuse
   std::generate(samples.begin(), samples.end(), [&]() {
-    std::string row;                      // build a row of random tokens
+    std::string row;  // build a row of random tokens
     while (static_cast<cudf::size_type>(row.size()) < row_width) {
       row += std::to_string(words_dist(generator)) + " ";
     }
diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 05ef21bd750..35851a99822 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -1393,7 +1393,7 @@ struct pair_accessor {
  */
 template <typename T, bool has_nulls = false>
 struct pair_rep_accessor {
-  column_device_view const col;               ///< column view of column in device
+  column_device_view const col;  ///< column view of column in device
 
   using rep_type = device_storage_type_t<T>;  ///< representation type
 
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index 1dd91dcd865..ebe7e052b6d 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -133,7 +133,7 @@ __launch_bounds__(block_size) __global__
     if (has_validity) {
       temp_valids[threadIdx.x] = false;  // init shared memory
       if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false;
-      __syncthreads();                   // wait for init
+      __syncthreads();  // wait for init
     }
 
     if (mask_true) {
diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index 0ab9da0dbd0..4731c4919e3 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -248,7 +248,7 @@ struct input_indexalator : base_indexalator<input_indexalator> {
   friend struct indexalator_factory;
   friend struct base_indexalator<input_indexalator>;  // for CRTP
 
-  using reference = size_type const;                  // this keeps STL and thrust happy
+  using reference = size_type const;  // this keeps STL and thrust happy
 
   input_indexalator()                                    = default;
   input_indexalator(input_indexalator const&)            = default;
@@ -332,7 +332,7 @@ struct output_indexalator : base_indexalator<output_indexalator> {
   friend struct indexalator_factory;
   friend struct base_indexalator<output_indexalator>;  // for CRTP
 
-  using reference = output_indexalator const&;         // required for output iterators
+  using reference = output_indexalator const&;  // required for output iterators
 
   output_indexalator()                                     = default;
   output_indexalator(output_indexalator const&)            = default;
diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp
index 6fcf10aef57..b69632c83ca 100644
--- a/cpp/include/cudf/detail/join.hpp
+++ b/cpp/include/cudf/detail/join.hpp
@@ -78,8 +78,8 @@ struct hash_join {
   cudf::null_equality const _nulls_equal;  ///< whether to consider nulls as equal
   cudf::table_view _build;                 ///< input table to build the hash map
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table>
-    _preprocessed_build;                   ///< input table preprocssed for row operators
-  map_type _hash_table;                    ///< hash table built on `_build`
+    _preprocessed_build;  ///< input table preprocssed for row operators
+  map_type _hash_table;   ///< hash table built on `_build`
 
  public:
   /**
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 7c59c2f9194..13d8716c1df 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -829,5 +829,5 @@ using decimal32  = fixed_point<int32_t, Radix::BASE_10>;     ///<  32-bit decima
 using decimal64  = fixed_point<int64_t, Radix::BASE_10>;     ///<  64-bit decimal fixed point
 using decimal128 = fixed_point<__int128_t, Radix::BASE_10>;  ///< 128-bit decimal fixed point
 
-/** @} */                                                    // end of group
+/** @} */  // end of group
 }  // namespace numeric
diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
index 6e575685daa..1c31e8777a8 100644
--- a/cpp/include/cudf/groupby.hpp
+++ b/cpp/include/cudf/groupby.hpp
@@ -386,8 +386,8 @@ class groupby {
                                                          ///< indicates null order
                                                          ///< of each column
   std::unique_ptr<detail::sort::sort_groupby_helper>
-    _helper;                                             ///< Helper object
-                                                         ///< used by sort based implementation
+    _helper;  ///< Helper object
+              ///< used by sort based implementation
 
   /**
    * @brief Get the sort helper object
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index c84ca7e6c73..b49a13a8ea9 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -213,7 +213,7 @@ class csv_reader_options {
 
     auto const max_row_bytes = 16 * 1024;  // 16KB
     auto const column_bytes  = 64;
-    auto const base_padding  = 1024;       // 1KB
+    auto const base_padding  = 1024;  // 1KB
 
     if (num_columns == 0) {
       // Use flat size if the number of columns is not known
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 15dc2a614ad..d408d249a7f 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -207,7 +207,7 @@ class json_reader_options {
 
     auto const max_row_bytes = 16 * 1024;  // 16KB
     auto const column_bytes  = 64;
-    auto const base_padding  = 1024;       // 1KB
+    auto const base_padding  = 1024;  // 1KB
 
     if (num_columns == 0) {
       // Use flat size if the number of columns is not known
diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp
index df8e2885782..e04572535de 100644
--- a/cpp/include/cudf/strings/detail/utf8.hpp
+++ b/cpp/include/cudf/strings/detail/utf8.hpp
@@ -155,18 +155,18 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str)
 constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char)
 {
   uint32_t unchr = 0;
-  if (utf8_char < 0x0000'0080)                // single-byte pass thru
+  if (utf8_char < 0x0000'0080)  // single-byte pass thru
     unchr = utf8_char;
-  else if (utf8_char < 0x0000'E000)           // two bytes
+  else if (utf8_char < 0x0000'E000)  // two bytes
   {
-    unchr = (utf8_char & 0x1F00) >> 2;        // shift and
-    unchr |= (utf8_char & 0x003F);            // unmask
-  } else if (utf8_char < 0x00F0'0000)         // three bytes
+    unchr = (utf8_char & 0x1F00) >> 2;  // shift and
+    unchr |= (utf8_char & 0x003F);      // unmask
+  } else if (utf8_char < 0x00F0'0000)   // three bytes
   {
-    unchr = (utf8_char & 0x0F'0000) >> 4;     // get upper 4 bits
-    unchr |= (utf8_char & 0x00'3F00) >> 2;    // shift and
-    unchr |= (utf8_char & 0x00'003F);         // unmask
-  } else if (utf8_char <= 0xF800'0000u)       // four bytes
+    unchr = (utf8_char & 0x0F'0000) >> 4;   // get upper 4 bits
+    unchr |= (utf8_char & 0x00'3F00) >> 2;  // shift and
+    unchr |= (utf8_char & 0x00'003F);       // unmask
+  } else if (utf8_char <= 0xF800'0000u)     // four bytes
   {
     unchr = (utf8_char & 0x0300'0000) >> 6;   // upper 3 bits
     unchr |= (utf8_char & 0x003F'0000) >> 4;  // next 6 bits
@@ -185,20 +185,20 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char)
 constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr)
 {
   cudf::char_utf8 utf8 = 0;
-  if (unchr < 0x0000'0080)               // single byte utf8
+  if (unchr < 0x0000'0080)  // single byte utf8
     utf8 = unchr;
-  else if (unchr < 0x0000'0800)          // double byte utf8
+  else if (unchr < 0x0000'0800)  // double byte utf8
   {
-    utf8 = (unchr << 2) & 0x1F00;        // shift bits for
-    utf8 |= (unchr & 0x3F);              // utf8 encoding
+    utf8 = (unchr << 2) & 0x1F00;  // shift bits for
+    utf8 |= (unchr & 0x3F);        // utf8 encoding
     utf8 |= 0x0000'C080;
-  } else if (unchr < 0x0001'0000)        // triple byte utf8
+  } else if (unchr < 0x0001'0000)  // triple byte utf8
   {
-    utf8 = (unchr << 4) & 0x0F'0000;     // upper 4 bits
-    utf8 |= (unchr << 2) & 0x00'3F00;    // next 6 bits
-    utf8 |= (unchr & 0x3F);              // last 6 bits
+    utf8 = (unchr << 4) & 0x0F'0000;   // upper 4 bits
+    utf8 |= (unchr << 2) & 0x00'3F00;  // next 6 bits
+    utf8 |= (unchr & 0x3F);            // last 6 bits
     utf8 |= 0x00E0'8080;
-  } else if (unchr < 0x0011'0000)        // quadruple byte utf8
+  } else if (unchr < 0x0011'0000)  // quadruple byte utf8
   {
     utf8 = (unchr << 6) & 0x0700'0000;   // upper 3 bits
     utf8 |= (unchr << 4) & 0x003F'0000;  // next 6 bits
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 599a85c8a54..4806f96c934 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -105,9 +105,9 @@ inline __device__ auto null_compare(bool lhs_is_null, bool rhs_is_null, null_ord
 {
   if (lhs_is_null and rhs_is_null) {  // null <? null
     return weak_ordering::EQUIVALENT;
-  } else if (lhs_is_null) {           // null <? x
+  } else if (lhs_is_null) {  // null <? x
     return (null_precedence == null_order::BEFORE) ? weak_ordering::LESS : weak_ordering::GREATER;
-  } else if (rhs_is_null) {           // x <? null
+  } else if (rhs_is_null) {  // x <? null
     return (null_precedence == null_order::AFTER) ? weak_ordering::LESS : weak_ordering::GREATER;
   }
   return weak_ordering::EQUIVALENT;
diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp
index 6f779bd457a..b90b2dac012 100644
--- a/cpp/include/cudf/table/table_view.hpp
+++ b/cpp/include/cudf/table/table_view.hpp
@@ -152,7 +152,7 @@ class table_view_base {
 
   table_view_base(table_view_base const&) = default;  ///< Copy constructor
 
-  table_view_base(table_view_base&&) = default;       ///< Move constructor
+  table_view_base(table_view_base&&) = default;  ///< Move constructor
   /**
    * @brief Copy assignment operator
    *
diff --git a/cpp/include/cudf/wrappers/dictionary.hpp b/cpp/include/cudf/wrappers/dictionary.hpp
index 98de549c724..329f1fa7754 100644
--- a/cpp/include/cudf/wrappers/dictionary.hpp
+++ b/cpp/include/cudf/wrappers/dictionary.hpp
@@ -215,5 +215,5 @@ CUDF_HOST_DEVICE inline bool operator>(dictionary_wrapper<Integer> const& lhs,
 
 using dictionary32 = dictionary_wrapper<int32_t>;  ///< 32-bit integer indexed dictionary wrapper
 
-/** @} */                                          // end of group
+/** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index b622d7c6b78..06aabbe4e9c 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -331,9 +331,9 @@ inline auto parse_cudf_test_opts(int argc, char** argv)
     cxxopts::Options options(argv[0], " - cuDF tests command line options");
     char const* env_rmm_mode = std::getenv("GTEST_CUDF_RMM_MODE");  // Overridden by CLI options
     char const* env_stream_mode =
-      std::getenv("GTEST_CUDF_STREAM_MODE");                        // Overridden by CLI options
+      std::getenv("GTEST_CUDF_STREAM_MODE");  // Overridden by CLI options
     char const* env_stream_error_mode =
-      std::getenv("GTEST_CUDF_STREAM_ERROR_MODE");                  // Overridden by CLI options
+      std::getenv("GTEST_CUDF_STREAM_ERROR_MODE");  // Overridden by CLI options
     auto default_rmm_mode          = env_rmm_mode ? env_rmm_mode : "pool";
     auto default_stream_mode       = env_stream_mode ? env_stream_mode : "default";
     auto default_stream_error_mode = env_stream_error_mode ? env_stream_error_mode : "error";
diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index ac75f5e9147..72a899d70b4 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -44,7 +44,7 @@ struct hashed_vocabulary {
   std::unique_ptr<cudf::column> bin_offsets;  ///< uint16 column, containing the start index of each
                                               ///< bin in the flattened hash table
   std::unique_ptr<cudf::column>
-    cp_metadata;   ///< uint32 column, The code point metadata table to use for normalization
+    cp_metadata;  ///< uint32 column, The code point metadata table to use for normalization
   std::unique_ptr<cudf::column>
     aux_cp_table;  ///< uint64 column, The auxiliary code point table to use for normalization
 };
diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py
index a617a4c0df7..e5e57dbf562 100644
--- a/cpp/scripts/run-clang-tidy.py
+++ b/cpp/scripts/run-clang-tidy.py
@@ -22,7 +22,7 @@
 import shutil
 
 
-EXPECTED_VERSION = "16.0.1"
+EXPECTED_VERSION = "16.0.6"
 VERSION_REGEX = re.compile(r"  LLVM version ([0-9.]+)")
 GPU_ARCH_REGEX = re.compile(r"sm_(\d+)")
 SPACES = re.compile(r"\s+")
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index e1a55ec5419..5ea56a05dcb 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -114,8 +114,8 @@ struct dst_buf_info {
   int bit_shift;           // # of bits to shift right by (for validity buffers)
   size_type valid_count;   // validity count for this block of work
 
-  int src_buf_index;       // source buffer index
-  int dst_buf_index;       // destination buffer index
+  int src_buf_index;  // source buffer index
+  int dst_buf_index;  // destination buffer index
 };
 
 /**
@@ -1384,7 +1384,7 @@ struct chunk_iteration_state {
   std::size_t starting_batch;  ///< Starting batch index for the current iteration
   std::vector<std::size_t> const h_num_buffs_per_iteration;  ///< The count of batches per iteration
   std::vector<std::size_t> const
-    h_size_of_buffs_per_iteration;                           ///< The size in bytes per iteration
+    h_size_of_buffs_per_iteration;  ///< The size in bytes per iteration
 };
 
 std::unique_ptr<chunk_iteration_state> chunk_iteration_state::create(
@@ -1989,7 +1989,7 @@ struct contiguous_split_state {
   // This can be 1 if `contiguous_split` is just packing and not splitting
   std::size_t const num_partitions;  ///< The number of partitions to produce
 
-  size_type const num_src_bufs;      ///< Number of source buffers including children
+  size_type const num_src_bufs;  ///< Number of source buffers including children
 
   std::size_t const num_bufs;  ///< Number of source buffers including children * number of splits
 
diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp
index c378ac99727..be36956b929 100644
--- a/cpp/src/groupby/sort/functors.hpp
+++ b/cpp/src/groupby/sort/functors.hpp
@@ -94,12 +94,12 @@ struct store_result_functor {
   };
 
  protected:
-  sort::sort_groupby_helper& helper;       ///< Sort helper
-  cudf::detail::result_cache& cache;       ///< cache of results to store into
-  column_view const& values;               ///< Column of values to group and aggregate
+  sort::sort_groupby_helper& helper;  ///< Sort helper
+  cudf::detail::result_cache& cache;  ///< cache of results to store into
+  column_view const& values;          ///< Column of values to group and aggregate
 
-  rmm::cuda_stream_view stream;            ///< CUDA stream on which to execute kernels
-  rmm::mr::device_memory_resource* mr;     ///< Memory resource to allocate space for results
+  rmm::cuda_stream_view stream;         ///< CUDA stream on which to execute kernels
+  rmm::mr::device_memory_resource* mr;  ///< Memory resource to allocate space for results
 
   sorted keys_are_sorted;                  ///< Whether the keys are sorted
   std::unique_ptr<column> sorted_values;   ///< Memoised grouped and sorted values
diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu
index 2c634d9b590..365f6d6875c 100644
--- a/cpp/src/io/avro/avro_gpu.cu
+++ b/cpp/src/io/avro/avro_gpu.cu
@@ -303,7 +303,7 @@ avro_decode_row(schemadesc_s const* schema,
     // If within an array, check if we reached the last item
     if (array_repeat_count != 0 && array_children <= 0 && cur < end) {
       if (!--array_repeat_count) {
-        i = array_start;                   // Restart at the array parent
+        i = array_start;  // Restart at the array parent
       } else {
         i              = array_start + 1;  // Restart after the array parent
         array_children = schema[array_start].count;
diff --git a/cpp/src/io/comp/cpu_unbz2.cpp b/cpp/src/io/comp/cpu_unbz2.cpp
index 7159ff30d7c..a116335b254 100644
--- a/cpp/src/io/comp/cpu_unbz2.cpp
+++ b/cpp/src/io/comp/cpu_unbz2.cpp
@@ -216,7 +216,7 @@ int32_t bz2_decompress_block(unbz_state_s* s)
 
   s->currBlockNo++;
 
-  skipbits(s, 32);                          // block CRC
+  skipbits(s, 32);  // block CRC
 
   if (getbits(s, 1)) return BZ_DATA_ERROR;  // blockRandomized not supported (old bzip versions)
 
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 542ca031b7c..8bafd054bdb 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -121,7 +121,7 @@ __inline__ __device__ int brotli_context(int p1, int p2, int lut)
 struct huff_scratch_s {
   uint16_t code_length_histo[16];
   uint8_t code_length_code_lengths[brotli_code_length_codes];
-  int8_t offset[6];                           // offsets in sorted table for each length
+  int8_t offset[6];  // offsets in sorted table for each length
   uint16_t lenvlctab[32];
   uint16_t sorted[brotli_code_length_codes];  // symbols sorted by code length
   int16_t next_symbol[32];
@@ -1298,7 +1298,7 @@ static __device__ void InverseMoveToFrontTransform(debrotli_state_s* s, uint8_t*
   // Reinitialize elements that could have been changed.
   uint32_t i           = 1;
   uint32_t upper_bound = s->mtf_upper_bound;
-  uint32_t* mtf        = &s->mtf[1];   // Make mtf[-1] addressable.
+  uint32_t* mtf        = &s->mtf[1];  // Make mtf[-1] addressable.
   auto* mtf_u8         = reinterpret_cast<uint8_t*>(mtf);
   uint32_t pattern     = 0x0302'0100;  // Little-endian
 
diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu
index 42c4fbe7bea..8993815e560 100644
--- a/cpp/src/io/comp/gpuinflate.cu
+++ b/cpp/src/io/comp/gpuinflate.cu
@@ -124,11 +124,11 @@ struct inflate_state_s {
   uint8_t* outbase;  ///< start of output buffer
   uint8_t* outend;   ///< end of output buffer
   // Input state
-  uint8_t const* cur;       ///< input buffer
-  uint8_t const* end;       ///< end of input buffer
+  uint8_t const* cur;  ///< input buffer
+  uint8_t const* end;  ///< end of input buffer
 
-  uint2 bitbuf;             ///< bit buffer (64-bit)
-  uint32_t bitpos;          ///< position in bit buffer
+  uint2 bitbuf;     ///< bit buffer (64-bit)
+  uint32_t bitpos;  ///< position in bit buffer
 
   int32_t err;              ///< Error status
   int btype;                ///< current block type
@@ -295,7 +295,7 @@ __device__ int construct(
     return 0;                    // complete, but decode() will fail
 
   // check for an over-subscribed or incomplete set of lengths
-  left = 1;                     // one possible code of zero length
+  left = 1;  // one possible code of zero length
   for (len = 1; len <= max_bits; len++) {
     left <<= 1;                 // one more bit, double codes left
     left -= counts[len];        // deduct count from possible codes
@@ -349,8 +349,8 @@ __device__ int init_dynamic(inflate_state_s* s)
   index = 0;
   while (index < nlen + ndist) {
     int symbol = decode(s, s->lencnt, s->lensym);
-    if (symbol < 0) return symbol;    // invalid symbol
-    if (symbol < 16)                  // length in 0..15
+    if (symbol < 0) return symbol;  // invalid symbol
+    if (symbol < 16)                // length in 0..15
       lengths[index++] = symbol;
     else {                            // repeat instruction
       int len = 0;                    // last length to repeat, assume repeating zeros
@@ -358,9 +358,9 @@ __device__ int init_dynamic(inflate_state_s* s)
         if (index == 0) return -5;    // no last length!
         len    = lengths[index - 1];  // last length
         symbol = 3 + getbits(s, 2);
-      } else if (symbol == 17)        // repeat zero 3..10 times
+      } else if (symbol == 17)  // repeat zero 3..10 times
         symbol = 3 + getbits(s, 3);
-      else                            // == 18, repeat zero 11..138 times
+      else  // == 18, repeat zero 11..138 times
         symbol = 11 + getbits(s, 7);
       if (index + symbol > nlen + ndist) return -6;  // too many lengths!
       while (symbol--)                               // repeat last or zero symbol times
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index 017fd8abb47..0d2d21333bb 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -28,7 +28,7 @@
 
 #include <cstring>  // memset
 
-#include <zlib.h>   // uncompress
+#include <zlib.h>  // uncompress
 
 using cudf::host_span;
 
@@ -47,7 +47,7 @@ struct gz_file_header_s {
   uint8_t os;         // OS id
 };
 
-struct zip_eocd_s          // end of central directory
+struct zip_eocd_s  // end of central directory
 {
   uint32_t sig;            // 0x0605'4b50
   uint16_t disk_id;        // number of this disk
@@ -59,7 +59,7 @@ struct zip_eocd_s          // end of central directory
                          // number uint16_t comment_len;   // comment length (excluded from struct)
 };
 
-struct zip64_eocdl      // end of central dir locator
+struct zip64_eocdl  // end of central dir locator
 {
   uint32_t sig;         // 0x0706'4b50
   uint32_t disk_start;  // number of the disk with the start of the zip64 end of central directory
@@ -67,7 +67,7 @@ struct zip64_eocdl      // end of central dir locator
   uint32_t num_disks;   // total number of disks
 };
 
-struct zip_cdfh_s        // central directory file header
+struct zip_cdfh_s  // central directory file header
 {
   uint32_t sig;          // 0x0201'4b50
   uint16_t ver;          // version made by
@@ -111,7 +111,7 @@ struct bz2_file_header_s {
 
 struct gz_archive_s {
   gz_file_header_s const* fhdr;
-  uint16_t hcrc16;           // header crc16 if present
+  uint16_t hcrc16;  // header crc16 if present
   uint16_t xlen;
   uint8_t const* fxtra;      // xlen bytes (optional)
   uint8_t const* fname;      // zero-terminated original filename if present
diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu
index a7a1cfd3f9e..c699502317f 100644
--- a/cpp/src/io/comp/unsnap.cu
+++ b/cpp/src/io/comp/unsnap.cu
@@ -45,7 +45,7 @@ void __device__ busy_wait(size_t cycles)
 struct unsnap_batch_s {
   int32_t len;  // 1..64 = Number of bytes
   uint32_t
-    offset;     // copy distance if greater than zero or negative of literal offset in byte stream
+    offset;  // copy distance if greater than zero or negative of literal offset in byte stream
 };
 
 /**
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index bdad16bd9f1..cabf904f020 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -169,7 +169,7 @@ reduce_to_column_tree(tree_meta_t& tree,
     });
 
   // 4. unique_copy parent_node_ids, ranges
-  rmm::device_uvector<TreeDepthT> column_levels(0, stream);                 // not required
+  rmm::device_uvector<TreeDepthT> column_levels(0, stream);  // not required
   rmm::device_uvector<NodeIndexT> parent_col_ids(num_columns, stream);
   rmm::device_uvector<SymbolOffsetT> col_range_begin(num_columns, stream);  // Field names
   rmm::device_uvector<SymbolOffsetT> col_range_end(num_columns, stream);
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index b691eaa8caf..0b49f97597d 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -762,18 +762,18 @@ auto get_translation_table(bool include_line_delimiter)
                                                         nl_tokens({}),   // LINE_BREAK
                                                         {ValueBegin}}};  // OTHER
   pda_tlt[static_cast<StateT>(pda_state_t::PD_BOA)] = {
-    {                                                                    /*ROOT*/
-     {ErrorBegin},                                                       // OPENING_BRACE
-     {ErrorBegin},                                                       // OPENING_BRACKET
-     {ErrorBegin},                                                       // CLOSING_BRACE
-     {ErrorBegin},                                                       // CLOSING_BRACKET
-     {ErrorBegin},                                                       // QUOTE
-     {ErrorBegin},                                                       // ESCAPE
-     {ErrorBegin},                                                       // COMMA
-     {ErrorBegin},                                                       // COLON
-     {ErrorBegin},                                                       // WHITE_SPACE
-     nl_tokens({ErrorBegin}),                                            // LINE_BREAK
-     {ErrorBegin},                                                       // OTHER
+    {                          /*ROOT*/
+     {ErrorBegin},             // OPENING_BRACE
+     {ErrorBegin},             // OPENING_BRACKET
+     {ErrorBegin},             // CLOSING_BRACE
+     {ErrorBegin},             // CLOSING_BRACKET
+     {ErrorBegin},             // QUOTE
+     {ErrorBegin},             // ESCAPE
+     {ErrorBegin},             // COMMA
+     {ErrorBegin},             // COLON
+     {ErrorBegin},             // WHITE_SPACE
+     nl_tokens({ErrorBegin}),  // LINE_BREAK
+     {ErrorBegin},             // OTHER
      /*LIST*/
      {StructBegin},  // OPENING_BRACE
      {ListBegin},    // OPENING_BRACKET
@@ -799,18 +799,18 @@ auto get_translation_table(bool include_line_delimiter)
      nl_tokens({}),                        // LINE_BREAK
      {ErrorBegin}}};                       // OTHER
   pda_tlt[static_cast<StateT>(pda_state_t::PD_LON)] = {
-    {                                      /*ROOT*/
-     {ErrorBegin},                         // OPENING_BRACE
-     {ErrorBegin},                         // OPENING_BRACKET
-     {ErrorBegin},                         // CLOSING_BRACE
-     {ErrorBegin},                         // CLOSING_BRACKET
-     {ErrorBegin},                         // QUOTE
-     {ErrorBegin},                         // ESCAPE
-     {ErrorBegin},                         // COMMA
-     {ErrorBegin},                         // COLON
-     {ValueEnd},                           // WHITE_SPACE
-     nl_tokens({ValueEnd}),                // LINE_BREAK
-     {},                                   // OTHER
+    {                        /*ROOT*/
+     {ErrorBegin},           // OPENING_BRACE
+     {ErrorBegin},           // OPENING_BRACKET
+     {ErrorBegin},           // CLOSING_BRACE
+     {ErrorBegin},           // CLOSING_BRACKET
+     {ErrorBegin},           // QUOTE
+     {ErrorBegin},           // ESCAPE
+     {ErrorBegin},           // COMMA
+     {ErrorBegin},           // COLON
+     {ValueEnd},             // WHITE_SPACE
+     nl_tokens({ValueEnd}),  // LINE_BREAK
+     {},                     // OTHER
      /*LIST*/
      {ErrorBegin},           // OPENING_BRACE
      {ErrorBegin},           // OPENING_BRACKET
@@ -824,17 +824,17 @@ auto get_translation_table(bool include_line_delimiter)
      nl_tokens({ValueEnd}),  // LINE_BREAK
      {},                     // OTHER
      /*STRUCT*/
-     {ErrorBegin},                                                      // OPENING_BRACE
-     {ErrorBegin},                                                      // OPENING_BRACKET
-     {ValueEnd, StructMemberEnd, StructEnd},                            // CLOSING_BRACE
-     {ErrorBegin},                                                      // CLOSING_BRACKET
-     {ErrorBegin},                                                      // QUOTE
-     {ErrorBegin},                                                      // ESCAPE
-     {ValueEnd, StructMemberEnd},                                       // COMMA
-     {ErrorBegin},                                                      // COLON
-     {ValueEnd},                                                        // WHITE_SPACE
-     nl_tokens({ValueEnd}),                                             // LINE_BREAK
-     {}}};                                                              // OTHER
+     {ErrorBegin},                            // OPENING_BRACE
+     {ErrorBegin},                            // OPENING_BRACKET
+     {ValueEnd, StructMemberEnd, StructEnd},  // CLOSING_BRACE
+     {ErrorBegin},                            // CLOSING_BRACKET
+     {ErrorBegin},                            // QUOTE
+     {ErrorBegin},                            // ESCAPE
+     {ValueEnd, StructMemberEnd},             // COMMA
+     {ErrorBegin},                            // COLON
+     {ValueEnd},                              // WHITE_SPACE
+     nl_tokens({ValueEnd}),                   // LINE_BREAK
+     {}}};                                    // OTHER
 
   pda_tlt[static_cast<StateT>(pda_state_t::PD_STR)] = {{                /*ROOT*/
                                                         {},             // OPENING_BRACE
@@ -974,17 +974,17 @@ auto get_translation_table(bool include_line_delimiter)
      nl_tokens({ErrorBegin}),  // LINE_BREAK
      {ErrorBegin},             // OTHER
      /*STRUCT*/
-     {ErrorBegin},                                                                // OPENING_BRACE
-     {ErrorBegin},                                                                // OPENING_BRACKET
-     {StructEnd},                                                                 // CLOSING_BRACE
-     {ErrorBegin},                                                                // CLOSING_BRACKET
-     {StructMemberBegin, FieldNameBegin},                                         // QUOTE
-     {ErrorBegin},                                                                // ESCAPE
-     {ErrorBegin},                                                                // COMMA
-     {ErrorBegin},                                                                // COLON
-     {},                                                                          // WHITE_SPACE
-     nl_tokens({}),                                                               // LINE_BREAK
-     {ErrorBegin}}};                                                              // OTHER
+     {ErrorBegin},                         // OPENING_BRACE
+     {ErrorBegin},                         // OPENING_BRACKET
+     {StructEnd},                          // CLOSING_BRACE
+     {ErrorBegin},                         // CLOSING_BRACKET
+     {StructMemberBegin, FieldNameBegin},  // QUOTE
+     {ErrorBegin},                         // ESCAPE
+     {ErrorBegin},                         // COMMA
+     {ErrorBegin},                         // COLON
+     {},                                   // WHITE_SPACE
+     nl_tokens({}),                        // LINE_BREAK
+     {ErrorBegin}}};                       // OTHER
 
   pda_tlt[static_cast<StateT>(pda_state_t::PD_FLN)] = {{                          /*ROOT*/
                                                         {ErrorBegin},             // OPENING_BRACE
@@ -1011,17 +1011,17 @@ auto get_translation_table(bool include_line_delimiter)
                                                         nl_tokens({ErrorBegin}),  // LINE_BREAK
                                                         {ErrorBegin},             // OTHER
                                                         /*STRUCT*/
-                                                        {},                       // OPENING_BRACE
-                                                        {},                       // OPENING_BRACKET
-                                                        {},                       // CLOSING_BRACE
-                                                        {},                       // CLOSING_BRACKET
-                                                        {FieldNameEnd},           // QUOTE
-                                                        {},                       // ESCAPE
-                                                        {},                       // COMMA
-                                                        {},                       // COLON
-                                                        {},                       // WHITE_SPACE
-                                                        nl_tokens({}),            // LINE_BREAK
-                                                        {}}};                     // OTHER
+                                                        {},              // OPENING_BRACE
+                                                        {},              // OPENING_BRACKET
+                                                        {},              // CLOSING_BRACE
+                                                        {},              // CLOSING_BRACKET
+                                                        {FieldNameEnd},  // QUOTE
+                                                        {},              // ESCAPE
+                                                        {},              // COMMA
+                                                        {},              // COLON
+                                                        {},              // WHITE_SPACE
+                                                        nl_tokens({}),   // LINE_BREAK
+                                                        {}}};            // OTHER
 
   pda_tlt[static_cast<StateT>(pda_state_t::PD_FNE)] = {{                          /*ROOT*/
                                                         {ErrorBegin},             // OPENING_BRACE
@@ -1048,17 +1048,17 @@ auto get_translation_table(bool include_line_delimiter)
                                                         nl_tokens({ErrorBegin}),  // LINE_BREAK
                                                         {ErrorBegin},             // OTHER
                                                         /*STRUCT*/
-                                                        {},                       // OPENING_BRACE
-                                                        {},                       // OPENING_BRACKET
-                                                        {},                       // CLOSING_BRACE
-                                                        {},                       // CLOSING_BRACKET
-                                                        {},                       // QUOTE
-                                                        {},                       // ESCAPE
-                                                        {},                       // COMMA
-                                                        {},                       // COLON
-                                                        {},                       // WHITE_SPACE
-                                                        nl_tokens({}),            // LINE_BREAK
-                                                        {}}};                     // OTHER
+                                                        {},             // OPENING_BRACE
+                                                        {},             // OPENING_BRACKET
+                                                        {},             // CLOSING_BRACE
+                                                        {},             // CLOSING_BRACKET
+                                                        {},             // QUOTE
+                                                        {},             // ESCAPE
+                                                        {},             // COMMA
+                                                        {},             // COLON
+                                                        {},             // WHITE_SPACE
+                                                        nl_tokens({}),  // LINE_BREAK
+                                                        {}}};           // OTHER
 
   pda_tlt[static_cast<StateT>(pda_state_t::PD_PFN)] = {{                          /*ROOT*/
                                                         {ErrorBegin},             // OPENING_BRACE
@@ -1097,18 +1097,18 @@ auto get_translation_table(bool include_line_delimiter)
                                                         nl_tokens({}),   // LINE_BREAK
                                                         {ErrorBegin}}};  // OTHER
 
-  pda_tlt[static_cast<StateT>(pda_state_t::PD_ERR)] = {{                 /*ROOT*/
-                                                        {},              // OPENING_BRACE
-                                                        {},              // OPENING_BRACKET
-                                                        {},              // CLOSING_BRACE
-                                                        {},              // CLOSING_BRACKET
-                                                        {},              // QUOTE
-                                                        {},              // ESCAPE
-                                                        {},              // COMMA
-                                                        {},              // COLON
-                                                        {},              // WHITE_SPACE
-                                                        nl_tokens({}),   // LINE_BREAK
-                                                        {},              // OTHER
+  pda_tlt[static_cast<StateT>(pda_state_t::PD_ERR)] = {{                /*ROOT*/
+                                                        {},             // OPENING_BRACE
+                                                        {},             // OPENING_BRACKET
+                                                        {},             // CLOSING_BRACE
+                                                        {},             // CLOSING_BRACKET
+                                                        {},             // QUOTE
+                                                        {},             // ESCAPE
+                                                        {},             // COMMA
+                                                        {},             // COLON
+                                                        {},             // WHITE_SPACE
+                                                        nl_tokens({}),  // LINE_BREAK
+                                                        {},             // OTHER
                                                         /*LIST*/
                                                         {},             // OPENING_BRACE
                                                         {},             // OPENING_BRACKET
diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp
index 681cc0fb9d2..9b8df50a22a 100644
--- a/cpp/src/io/orc/orc_gpu.hpp
+++ b/cpp/src/io/orc/orc_gpu.hpp
@@ -157,7 +157,7 @@ struct EncChunk {
   uint8_t dtype_len;                 // data type length
   int32_t scale;                     // scale for decimals or timestamps
 
-  uint32_t* dict_index;              // dictionary index from row index
+  uint32_t* dict_index;  // dictionary index from row index
   uint32_t* decimal_offsets;
   orc_column_device_view const* column;
 };
diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index b66ca827119..3edcd3d83b2 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -367,14 +367,14 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos)
       if (zbit) {
         return 5 + (zbit >> 3);  // up to 9x7 bits
       } else if ((sizeof(T) <= 8) || (bytestream_readbyte(bs, pos + 9) <= 0x7f)) {
-        return 10;               // up to 70 bits
+        return 10;  // up to 70 bits
       } else {
         uint64_t next64 = bytestream_readu64(bs, pos + 10);
         zbit            = __ffsll((~next64) & 0x8080'8080'8080'8080ull);
         if (zbit) {
           return 10 + (zbit >> 3);  // Up to 18x7 bits (126)
         } else {
-          return 19;                // Up to 19x7 bits (133)
+          return 19;  // Up to 19x7 bits (133)
         }
       }
     }
diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index 92fcd151925..ae11af92f78 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -168,7 +168,7 @@ bool CompactProtocolReader::read(LogicalType* l)
                     ParquetFieldUnion(2, l->isset.MAP, l->MAP),
                     ParquetFieldUnion(3, l->isset.LIST, l->LIST),
                     ParquetFieldUnion(4, l->isset.ENUM, l->ENUM),
-                    ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL),      // read the struct
+                    ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL),  // read the struct
                     ParquetFieldUnion(6, l->isset.DATE, l->DATE),
                     ParquetFieldUnion(7, l->isset.TIME, l->TIME),            //  read the struct
                     ParquetFieldUnion(8, l->isset.TIMESTAMP, l->TIMESTAMP),  //  read the struct
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index b2a89129645..b2c0c97c52d 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -315,7 +315,7 @@ inline void CompactProtocolFieldWriter::field_struct(int field, T const& val)
   if constexpr (not std::is_empty_v<T>) {
     writer.write(val);  // write the struct if it's not empty
   } else {
-    put_byte(0);        // otherwise, add a stop field
+    put_byte(0);  // otherwise, add a stop field
   }
   current_field_value = field;
 }
diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh
index 4fc8b9cfb8e..2382e4aafdf 100644
--- a/cpp/src/io/parquet/delta_binary.cuh
+++ b/cpp/src/io/parquet/delta_binary.cuh
@@ -90,16 +90,16 @@ inline __device__ zigzag128_t get_zz128(uint8_t const*& cur, uint8_t const* end)
 }
 
 struct delta_binary_decoder {
-  uint8_t const* block_start;    // start of data, but updated as data is read
-  uint8_t const* block_end;      // end of data
-  uleb128_t block_size;          // usually 128, must be multiple of 128
-  uleb128_t mini_block_count;    // usually 4, chosen such that block_size/mini_block_count is a
-                                 // multiple of 32
-  uleb128_t value_count;         // total values encoded in the block
-  zigzag128_t last_value;        // last value decoded, initialized to first_value from header
-
-  uint32_t values_per_mb;        // block_size / mini_block_count, must be multiple of 32
-  uint32_t current_value_idx;    // current value index, initialized to 0 at start of block
+  uint8_t const* block_start;  // start of data, but updated as data is read
+  uint8_t const* block_end;    // end of data
+  uleb128_t block_size;        // usually 128, must be multiple of 128
+  uleb128_t mini_block_count;  // usually 4, chosen such that block_size/mini_block_count is a
+                               // multiple of 32
+  uleb128_t value_count;       // total values encoded in the block
+  zigzag128_t last_value;      // last value decoded, initialized to first_value from header
+
+  uint32_t values_per_mb;      // block_size / mini_block_count, must be multiple of 32
+  uint32_t current_value_idx;  // current value index, initialized to 0 at start of block
 
   zigzag128_t cur_min_delta;     // min delta for the block
   uint32_t cur_mb;               // index of the current mini-block within the block
diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu
index e79a479388f..35f33a761be 100644
--- a/cpp/src/io/parquet/page_delta_decode.cu
+++ b/cpp/src/io/parquet/page_delta_decode.cu
@@ -85,7 +85,7 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary(
 
     if (t < 2 * warp_size) {  // warp0..1
       target_pos = min(src_pos + 2 * batch_size, s->nz_count + batch_size);
-    } else {                  // warp2
+    } else {  // warp2
       target_pos = min(s->nz_count, src_pos + batch_size);
     }
     __syncthreads();
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index a729f28d672..f7318bb9935 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -365,8 +365,8 @@ struct ColumnIndex {
   std::vector<std::vector<uint8_t>> min_values;  // lower bound for values in each page
   std::vector<std::vector<uint8_t>> max_values;  // upper bound for values in each page
   BoundaryOrder boundary_order =
-    BoundaryOrder::UNORDERED;                    // Indicates if min and max values are ordered
-  std::vector<int64_t> null_counts;              // Optional count of null values per page
+    BoundaryOrder::UNORDERED;        // Indicates if min and max values are ordered
+  std::vector<int64_t> null_counts;  // Optional count of null values per page
 };
 
 // bit space we are reserving in column_buffer::user_data
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index e82b6abc13d..a3cc37dee4f 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -299,7 +299,7 @@ struct ColumnChunkDesc {
   int8_t converted_type;                      // converted type enum
   LogicalType logical_type;                   // logical type
   int8_t decimal_precision;                   // Decimal precision
-  int32_t ts_clock_rate;   // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)
+  int32_t ts_clock_rate;  // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)
 
   int32_t src_col_index;   // my input column index
   int32_t src_col_schema;  // my schema index in the file
@@ -396,16 +396,16 @@ constexpr uint32_t encoding_to_mask(Encoding encoding)
 struct EncColumnChunk {
   parquet_column_device_view const* col_desc;  //!< Column description
   size_type col_desc_id;
-  PageFragment* fragments;                     //!< First fragment in chunk
-  uint8_t* uncompressed_bfr;                   //!< Uncompressed page data
-  uint8_t* compressed_bfr;                     //!< Compressed page data
-  statistics_chunk const* stats;               //!< Fragment statistics
-  uint32_t bfr_size;                           //!< Uncompressed buffer size
-  uint32_t compressed_size;                    //!< Compressed buffer size
-  uint32_t max_page_data_size;  //!< Max data size (excluding header) of any page in this chunk
-  uint32_t page_headers_size;   //!< Sum of size of all page headers
-  size_type start_row;          //!< First row of chunk
-  uint32_t num_rows;            //!< Number of rows in chunk
+  PageFragment* fragments;        //!< First fragment in chunk
+  uint8_t* uncompressed_bfr;      //!< Uncompressed page data
+  uint8_t* compressed_bfr;        //!< Compressed page data
+  statistics_chunk const* stats;  //!< Fragment statistics
+  uint32_t bfr_size;              //!< Uncompressed buffer size
+  uint32_t compressed_size;       //!< Compressed buffer size
+  uint32_t max_page_data_size;    //!< Max data size (excluding header) of any page in this chunk
+  uint32_t page_headers_size;     //!< Sum of size of all page headers
+  size_type start_row;            //!< First row of chunk
+  uint32_t num_rows;              //!< Number of rows in chunk
   size_type num_values;     //!< Number of values in chunk. Different from num_rows for nested types
   uint32_t first_fragment;  //!< First fragment of chunk
   EncPage* pages;           //!< Ptr to pages that belong to this chunk
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index bde73c3dd96..a2db0de26bb 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1673,7 +1673,7 @@ void reader::impl::preprocess_pages(size_t skip_rows,
     // - we will be doing a chunked read
     gpu::ComputePageSizes(pages,
                           chunks,
-                          0,                     // 0-max size_t. process all possible rows
+                          0,  // 0-max size_t. process all possible rows
                           std::numeric_limits<size_t>::max(),
                           true,                  // compute num_rows
                           chunk_read_limit > 0,  // compute string sizes
diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu
index 8210f3114d6..ae025b1a213 100644
--- a/cpp/src/join/join.cu
+++ b/cpp/src/join/join.cu
@@ -73,7 +73,7 @@ left_join(table_view const& left_input,
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
   auto matched = cudf::dictionary::detail::match_dictionaries(
-    {left_input, right_input},                // these should match
+    {left_input, right_input},  // these should match
     stream,
     rmm::mr::get_current_device_resource());  // temporary objects returned
   // now rebuild the table views with the updated ones
@@ -98,7 +98,7 @@ full_join(table_view const& left_input,
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
   auto matched = cudf::dictionary::detail::match_dictionaries(
-    {left_input, right_input},                // these should match
+    {left_input, right_input},  // these should match
     stream,
     rmm::mr::get_current_device_resource());  // temporary objects returned
   // now rebuild the table views with the updated ones
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index 2ce55e10fb1..9e8b75ae3b6 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -459,7 +459,7 @@ __global__ void generate_cluster_limits_kernel(int delta,
     int adjusted_w_index       = nearest_w_index;
     if ((last_inserted_index < 0) ||  // if we haven't inserted anything yet
         (nearest_w_index ==
-         last_inserted_index)) {      // if we land in the same bucket as the previous cap
+         last_inserted_index)) {  // if we land in the same bucket as the previous cap
 
       // force the value into this bucket
       adjusted_w_index = (last_inserted_index == group_size - 1)
diff --git a/cpp/src/rolling/detail/rolling_collect_list.cuh b/cpp/src/rolling/detail/rolling_collect_list.cuh
index 9f74a961e12..39d15ed716f 100644
--- a/cpp/src/rolling/detail/rolling_collect_list.cuh
+++ b/cpp/src/rolling/detail/rolling_collect_list.cuh
@@ -116,7 +116,7 @@ std::unique_ptr<column> create_collect_gather_map(column_view const& child_offse
     thrust::make_counting_iterator<size_type>(per_row_mapping.size()),
     gather_map->mutable_view().template begin<size_type>(),
     [d_offsets =
-       child_offsets.template begin<size_type>(),    // E.g. [0,   2,     5,     8,     11, 13]
+       child_offsets.template begin<size_type>(),  // E.g. [0,   2,     5,     8,     11, 13]
      d_groups =
        per_row_mapping.template begin<size_type>(),  // E.g. [0,0, 1,1,1, 2,2,2, 3,3,3, 4,4]
      d_prev = preceding_iter] __device__(auto i) {
diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu
index b87fb80fcc2..0c0ad0ad29e 100644
--- a/cpp/src/strings/char_types/char_types.cu
+++ b/cpp/src/strings/char_types/char_types.cu
@@ -139,9 +139,9 @@ struct filter_chars_fn {
   {
     auto const code_point = detail::utf8_to_codepoint(ch);
     auto const flag       = code_point <= 0x00'FFFF ? d_flags[code_point] : 0;
-    if (flag == 0)                       // all types pass unless specifically identified
+    if (flag == 0)  // all types pass unless specifically identified
       return (types_to_remove == ALL_TYPES);
-    if (types_to_keep == ALL_TYPES)      // filter case
+    if (types_to_keep == ALL_TYPES)  // filter case
       return (types_to_remove & flag) != 0;
     return (types_to_keep & flag) == 0;  // keep case
   }
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index cca06ca0739..8a953d778ed 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -317,8 +317,8 @@ struct parse_datetime {
           bytes_read -= left;
           break;
         }
-        case 'u': [[fallthrough]];      // day of week: Mon(1)-Sat(6),Sun(7)
-        case 'w': {                     // day of week; Sun(0),Mon(1)-Sat(6)
+        case 'u': [[fallthrough]];  // day of week: Mon(1)-Sat(6),Sun(7)
+        case 'w': {                 // day of week; Sun(0),Mon(1)-Sat(6)
           auto const [weekday, left] = parse_int(ptr, item.length);
           timeparts.weekday          =  // 0 is mapped to 7 for chrono library
             static_cast<int8_t>((item.value == 'w' && weekday == 0) ? 7 : weekday);
@@ -1000,7 +1000,7 @@ struct datetime_formatter_fn {
         case 'S':  // second
           copy_value = timeparts.second;
           break;
-        case 'f':                                 // sub-second
+        case 'f':  // sub-second
         {
           char subsecond_digits[] = "000000000";  // 9 max digits
           int const digits        = [] {
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 863f76b9b98..6ab70825a6b 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -576,7 +576,7 @@ struct parse_duration {
           item_length++;  // :
           timeparts->second = parse_second(ptr + item_length, item_length);
           break;
-        case 'r':         // hh:MM:SS AM/PM
+        case 'r':  // hh:MM:SS AM/PM
           timeparts->hour = parse_hour(ptr, item_length);
           item_length++;  // :
           timeparts->minute = parse_minute(ptr + item_length, item_length);
diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
index ab1e6870937..32167589ab4 100644
--- a/cpp/src/strings/convert/convert_floats.cu
+++ b/cpp/src/strings/convert/convert_floats.cu
@@ -284,7 +284,7 @@ struct ftos_converter {
       while (pb != buffer)  // reverses the digits
         *ptr++ = *--pb;     // e.g. 54321 -> 12345
     } else
-      *ptr++ = '0';         // always include at least .0
+      *ptr++ = '0';  // always include at least .0
     // exponent
     if (exp10) {
       *ptr++ = 'e';
@@ -310,7 +310,7 @@ struct ftos_converter {
   {
     if (std::isnan(value)) return 3;  // NaN
     bool bneg = false;
-    if (signbit(value)) {             // handles -0.0 too
+    if (signbit(value)) {  // handles -0.0 too
       value = -value;
       bneg  = true;
     }
@@ -337,7 +337,7 @@ struct ftos_converter {
       ++count;  // always include .0
     // exponent
     if (exp10) {
-      count += 2;                  // 'e±'
+      count += 2;  // 'e±'
       if (exp10 < 0) exp10 = -exp10;
       count += (int)(exp10 < 10);  // padding
       while (exp10 > 0) {
diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu
index 260c3393f3c..5597d2831c0 100644
--- a/cpp/src/strings/convert/convert_integers.cu
+++ b/cpp/src/strings/convert/convert_integers.cu
@@ -76,7 +76,7 @@ struct string_to_integer_check_fn {
       auto const digit       = static_cast<IntegerType>(chr - '0');
       auto const bound_check = (bound_val - sign * digit) / IntegerType{10} * sign;
       if (value > bound_check) return false;
-      value = value* IntegerType{10} + digit;
+      value = value * IntegerType{10} + digit;
     }
 
     return true;
diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu
index 4606aba6d17..adb72cb0263 100644
--- a/cpp/src/strings/convert/convert_ipv4.cu
+++ b/cpp/src/strings/convert/convert_ipv4.cu
@@ -197,7 +197,7 @@ std::unique_ptr<column> is_ipv4(strings_column_view const& strings,
                       if (d_str.empty()) return false;
                       constexpr int max_ip = 255;  // values must be in [0,255]
                       int ip_vals[4]       = {-1, -1, -1, -1};
-                      int ipv_idx          = 0;    // index into ip_vals
+                      int ipv_idx          = 0;  // index into ip_vals
                       for (auto const ch : d_str) {
                         if ((ch >= '0') && (ch <= '9')) {
                           auto const ip_val    = ip_vals[ipv_idx];
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index 71b6c09310e..9efa148cfd2 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -107,9 +107,9 @@ struct url_encoder_fn {
             out_ptr = copy_and_increment(out_ptr, hex, 2);  // add them to the output
           }
         }
-      } else                       // these are to be utf-8 url-encoded
+      } else  // these are to be utf-8 url-encoded
       {
-        uint8_t char_bytes[4];     // holds utf-8 bytes for one character
+        uint8_t char_bytes[4];  // holds utf-8 bytes for one character
         size_type char_width = from_char_utf8(ch, reinterpret_cast<char*>(char_bytes));
         nbytes += char_width * 3;  // '%' plus 2 hex chars per byte (example: é is %C3%A9)
         // process each byte in this current character
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 2d2691e0518..c56752f5429 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -984,7 +984,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
       col.size(),
       rmm::device_buffer{0, stream, mr},  // no data
       cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr),
-      col.size());                        // null count
+      col.size());  // null count
   }
 
   constexpr int block_size = 512;
diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 5fd098a872e..b7a7f19369d 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -184,9 +184,9 @@ class regex_parser {
   int32_t _id_cclass_d{-1};  // digits [0-9]
   int32_t _id_cclass_D{-1};  // not digits
 
-  char32_t _chr{};           // last lex'd char
-  int32_t _cclass_id{};      // last lex'd class
-  int16_t _min_count{};      // data for counted operators
+  char32_t _chr{};       // last lex'd char
+  int32_t _cclass_id{};  // last lex'd class
+  int16_t _min_count{};  // data for counted operators
   int16_t _max_count{};
 
   std::vector<Item> _items;
@@ -361,9 +361,9 @@ class regex_parser {
         auto [q, n_chr] = next_char();
         if (n_chr == 0) { return 0; }  // malformed: '[x-'
 
-        if (!q && n_chr == ']') {      // handles: '[x-]'
+        if (!q && n_chr == ']') {  // handles: '[x-]'
           literals.push_back(chr);
-          literals.push_back(chr);     // add '-' as literal
+          literals.push_back(chr);  // add '-' as literal
           break;
         }
         // normal case: '[a-z]'
@@ -749,7 +749,7 @@ class regex_parser {
           // infinite repeats
           if (n > 0) {  // append '+' after last repetition
             out.push_back(regex_parser::Item{item.type == COUNTED ? PLUS : PLUS_LAZY, 0});
-          } else {      // copy it once then append '*'
+          } else {  // copy it once then append '*'
             out.insert(out.end(), begin, end);
             out.push_back(regex_parser::Item{item.type == COUNTED ? STAR : STAR_LAZY, 0});
           }
@@ -1095,7 +1095,7 @@ void reprog::build_start_ids()
     ids.pop();
     reinst const& inst = _insts[id];
     if (inst.type == OR) {
-      if (inst.u2.left_id != id)   // prevents infinite while-loop here
+      if (inst.u2.left_id != id)  // prevents infinite while-loop here
         ids.push(inst.u2.left_id);
       if (inst.u1.right_id != id)  // prevents infinite while-loop here
         ids.push(inst.u1.right_id);
diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h
index aa2cb363b80..ab912ace0df 100644
--- a/cpp/src/strings/regex/regcomp.h
+++ b/cpp/src/strings/regex/regcomp.h
@@ -77,16 +77,16 @@ constexpr int32_t NCCLASS_D{1 << 5};  // not CCLASS_D or '\n'
  * @brief Structure of an encoded regex instruction
  */
 struct reinst {
-  int32_t type;       /* operator type or instruction type */
+  int32_t type; /* operator type or instruction type */
   union {
     int32_t cls_id;   /* class pointer */
     char32_t c;       /* character */
     int32_t subid;    /* sub-expression id for RBRA and LBRA */
     int32_t right_id; /* right child of OR */
   } u1;
-  union {             /* regexec relies on these two being in the same union */
-    int32_t left_id;  /* left child of OR */
-    int32_t next_id;  /* next instruction for CAT & LBRA */
+  union {            /* regexec relies on these two being in the same union */
+    int32_t left_id; /* left child of OR */
+    int32_t next_id; /* next instruction for CAT & LBRA */
   } u2;
   int32_t reserved4;
 };
diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh
index 19d82380350..c1abbd78b43 100644
--- a/cpp/src/strings/regex/regex.cuh
+++ b/cpp/src/strings/regex/regex.cuh
@@ -253,21 +253,21 @@ class reprog_device {
 
   reprog_device(reprog const&);
 
-  int32_t _startinst_id;              // first instruction id
-  int32_t _num_capturing_groups;      // instruction groups
-  int32_t _insts_count;               // number of instructions
-  int32_t _starts_count;              // number of start-insts ids
-  int32_t _classes_count;             // number of classes
-  int32_t _max_insts;                 // for partitioning working memory
+  int32_t _startinst_id;          // first instruction id
+  int32_t _num_capturing_groups;  // instruction groups
+  int32_t _insts_count;           // number of instructions
+  int32_t _starts_count;          // number of start-insts ids
+  int32_t _classes_count;         // number of classes
+  int32_t _max_insts;             // for partitioning working memory
 
   uint8_t const* _codepoint_flags{};  // table of character types
   reinst const* _insts{};             // array of regex instructions
   int32_t const* _startinst_ids{};    // array of start instruction ids
   reclass_device const* _classes{};   // array of regex classes
 
-  std::size_t _prog_size{};           // total size of this instance
-  void* _buffer{};                    // working memory buffer
-  int32_t _thread_count{};            // threads available in working memory
+  std::size_t _prog_size{};  // total size of this instance
+  void* _buffer{};           // working memory buffer
+  int32_t _thread_count{};   // threads available in working memory
 };
 
 /**
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index c5205ae7789..ce12dc17aa4 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -146,17 +146,17 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch,
   uint32_t codept = utf8_to_codepoint(ch);
   if (codept > 0x00'FFFF) return false;
   int8_t fl = codepoint_flags[codept];
-  if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl)))                    // \w
+  if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl)))  // \w
     return true;
-  if ((builtins & CCLASS_S) && IS_SPACE(fl))                                        // \s
+  if ((builtins & CCLASS_S) && IS_SPACE(fl))  // \s
     return true;
-  if ((builtins & CCLASS_D) && IS_DIGIT(fl))                                        // \d
+  if ((builtins & CCLASS_D) && IS_DIGIT(fl))  // \d
     return true;
   if ((builtins & NCCLASS_W) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl)))  // \W
     return true;
-  if ((builtins & NCCLASS_S) && !IS_SPACE(fl))                                      // \S
+  if ((builtins & NCCLASS_S) && !IS_SPACE(fl))  // \S
     return true;
-  if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl)))                    // \D
+  if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl)))  // \D
     return true;
   //
   return false;
diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu
index 460074a5296..81ddb937be5 100644
--- a/cpp/src/strings/replace/replace_re.cu
+++ b/cpp/src/strings/replace/replace_re.cu
@@ -68,7 +68,7 @@ struct replace_regex_fn {
       if (!match) { break; }  // no more matches
 
       auto const [start_pos, end_pos] = match_positions_to_bytes(*match, d_str, last_pos);
-      nbytes += d_repl.size_bytes() - (end_pos - start_pos);               // add new size
+      nbytes += d_repl.size_bytes() - (end_pos - start_pos);  // add new size
 
       if (out_ptr) {                                                       // replace:
                                                                            // i:bbbbsssseeee
diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu
index 099f5978992..0c7d119ea38 100644
--- a/cpp/src/strings/split/partition.cu
+++ b/cpp/src/strings/split/partition.cu
@@ -170,7 +170,7 @@ struct rpartition_fn : public partition_fn {
       --itr;
       pos = check_delimiter(idx, d_str, itr);
     }
-    if (pos < 0)                                        // delimiter not found
+    if (pos < 0)  // delimiter not found
     {
       d_indices_left[idx]  = string_index_pair{"", 0};  // two empty
       d_indices_delim[idx] = string_index_pair{"", 0};  // strings
diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index e76d8ac1c60..dc0b04af388 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -190,7 +190,7 @@ struct split_tokenizer_fn : base_split_tokenizer<split_tokenizer_fn> {
                                  device_span<size_type const> d_delimiters,
                                  device_span<string_index_pair> d_tokens) const
   {
-    auto const base_ptr    = get_base_ptr();                // d_positions values based on this
+    auto const base_ptr    = get_base_ptr();  // d_positions values based on this
     auto str_ptr           = d_str.data();
     auto const str_end     = str_ptr + d_str.size_bytes();  // end of the string
     auto const token_count = static_cast<size_type>(d_tokens.size());
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 9aeb6b69bdc..3be5937297f 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -91,7 +91,7 @@ struct token_reader_fn {
       } else {
         if (direction == split_direction::FORWARD) { break; }  // we are done
         for (auto l = 0; l < token_idx - 1; ++l) {
-          d_result[l] = d_result[l + 1];                       // shift left
+          d_result[l] = d_result[l + 1];  // shift left
         }
         d_result[token_idx - 1] = token;
       }
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 57a868485df..c8c68d19ce6 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -86,9 +86,9 @@ thread_safe_per_context_cache<special_case_mapping> d_special_case_mappings;
 
 }  // namespace
 
-   /**
-    * @copydoc cudf::strings::detail::get_character_flags_table
-    */
+/**
+ * @copydoc cudf::strings::detail::get_character_flags_table
+ */
 character_flags_table_type const* get_character_flags_table()
 {
   return d_character_codepoint_flags.find_or_initialize([&](void) {
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index 78dfb6bf1a6..1b07b0785f5 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -70,7 +70,7 @@ struct normalize_spaces_fn {
     cudf::string_view const single_space(" ", 1);
     auto const d_str = d_strings.element<cudf::string_view>(idx);
     char* buffer     = d_chars ? d_chars + d_offsets[idx] : nullptr;
-    char* optr       = buffer;   // running output pointer
+    char* optr       = buffer;  // running output pointer
 
     cudf::size_type nbytes = 0;  // holds the number of bytes per output string
 
@@ -146,7 +146,7 @@ struct codepoint_to_utf8_fn {
     char* out_ptr = d_chars + d_offsets[idx];
     for (uint32_t jdx = 0; jdx < count; ++jdx) {
       uint32_t code_point = *str_cps++;
-      if (code_point < UTF8_1BYTE)         // ASCII range
+      if (code_point < UTF8_1BYTE)  // ASCII range
         *out_ptr++ = static_cast<char>(code_point);
       else if (code_point < UTF8_2BYTE) {  // create two-byte UTF-8
         // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy
diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu
index d122f048a4e..34916e121dc 100644
--- a/cpp/src/text/replace.cu
+++ b/cpp/src/text/replace.cu
@@ -114,7 +114,7 @@ using strings_iterator = cudf::column_device_view::const_iterator<cudf::string_v
  * time to fill in the allocated output buffer for each string.
  */
 struct replace_tokens_fn : base_token_replacer_fn {
-  strings_iterator d_targets_begin;               ///< strings to search for
+  strings_iterator d_targets_begin;  ///< strings to search for
   strings_iterator d_targets_end;
   cudf::column_device_view const d_replacements;  ///< replacement strings
 
diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 4c4f5b3a4b1..13c744ac6bd 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -261,7 +261,7 @@ struct byte_pair_encoding_fn {
           while (itr < end) {
             auto rhs = next_substr(itr, end, d_str);
             if (d_pair.first == lhs && d_pair.second == rhs) {
-              *itr = 0;                   // removes the pair from this string
+              *itr = 0;  // removes the pair from this string
               itr += rhs.size_bytes();
               if (itr >= end) { break; }  // done checking for pairs
               // skip to the next adjacent pair
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index 1f1b90b3f49..db6ad2e2dd2 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -93,7 +93,7 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
     static_cast<size_t>(input.size() * 2),  // capacity is 2x;
     cuco::empty_key{-1},
-    cuco::empty_value{-1},                  // empty value is not used
+    cuco::empty_value{-1},  // empty value is not used
     bpe_equal{input},
     probe_scheme{bpe_hasher{input}},
     hash_table_allocator_type{default_allocator<char>{}, stream},
diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh
index fbd2d1efcff..a84e94a6924 100644
--- a/cpp/src/text/utilities/tokenize_ops.cuh
+++ b/cpp/src/text/utilities/tokenize_ops.cuh
@@ -230,7 +230,7 @@ struct multi_delimiter_strings_tokenizer {
         });
       if (itr_find != delimiters_end) {  // found delimiter
         auto token_size = static_cast<cudf::size_type>((curr_ptr - data_ptr) - last_pos);
-        if (token_size > 0)              // we only care about non-zero sized tokens
+        if (token_size > 0)  // we only care about non-zero sized tokens
         {
           if (d_str_tokens)
             d_str_tokens[token_idx] = string_index_pair{data_ptr + last_pos, token_size};
diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp
index 991473c5023..f2909f870aa 100644
--- a/cpp/tests/groupby/merge_lists_tests.cpp
+++ b/cpp/tests/groupby/merge_lists_tests.cpp
@@ -374,7 +374,7 @@ TEST_F(GroupbyMergeListsTest, StringsColumnInput)
                 "" /*NULL*/,
                 "" /*NULL*/,
                 "German Shepherd",
-                ""                                                /*NULL*/
+                "" /*NULL*/
               },
               nulls_at({3, 4, 5, 7})},                            // key = "dog"
     lists_col{{"Whale", "" /*NULL*/, "Polar Bear"}, null_at(1)},  // key = "unknown"
diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp
index 67ff61563bb..5fc7e68b524 100644
--- a/cpp/tests/groupby/merge_sets_tests.cpp
+++ b/cpp/tests/groupby/merge_sets_tests.cpp
@@ -333,7 +333,7 @@ TEST_F(GroupbyMergeSetsTest, StringsColumnInput)
     lists_col{{"" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, all_nulls()}  // key = "dog"
   };
   auto const lists3 = lists_col{
-    lists_col{"Fuji", "Red Delicious"},           // key = "apple"
+    lists_col{"Fuji", "Red Delicious"},  // key = "apple"
     lists_col{{"" /*NULL*/, "Corgi", "German Shepherd", "" /*NULL*/, "Golden Retriever"},
               nulls_at({0, 3})},                  // key = "dog"
     lists_col{{"Seeedless", "Mini"}, no_nulls()}  // key = "water melon"
@@ -343,14 +343,14 @@ TEST_F(GroupbyMergeSetsTest, StringsColumnInput)
     merge_sets(vcol_views{keys1, keys2, keys3}, vcol_views{lists1, lists2, lists3});
   auto const expected_keys  = strings_col{"apple", "banana", "dog", "unknown", "water melon"};
   auto const expected_lists = lists_col{
-    lists_col{"Fuji", "Honey Bee", "Red Delicious"},                         // key = "apple"
-    lists_col{"Green", "Yellow"},                                            // key = "banana"
+    lists_col{"Fuji", "Honey Bee", "Red Delicious"},  // key = "apple"
+    lists_col{"Green", "Yellow"},                     // key = "banana"
     lists_col{{
                 "Corgi", "German Shepherd", "Golden Retriever", "Poodle", "" /*NULL*/
               },
-              null_at(4)},                                                   // key = "dog"
-    lists_col{{"Polar Bear", "Whale", "" /*NULL*/}, null_at(2)},             // key = "unknown"
-    lists_col{{"Mini", "Seeedless"}, no_nulls()}                             // key = "water melon"
+              null_at(4)},                                        // key = "dog"
+    lists_col{{"Polar Bear", "Whale", "" /*NULL*/}, null_at(2)},  // key = "unknown"
+    lists_col{{"Mini", "Seeedless"}, no_nulls()}                  // key = "water melon"
   };
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *out_keys, verbosity);
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 64aca091686..81e0e12eeb9 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -2166,7 +2166,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList)
   cudf::io::table_input_metadata metadata(table1);
   metadata.column_metadata[0].set_nullability(true);  // List is nullable at first (root) level
   metadata.column_metadata[0].child(1).set_nullability(
-    false);                                           // non-nullable at second (leaf) level
+    false);  // non-nullable at second (leaf) level
   metadata.column_metadata[1].set_nullability(true);
 
   auto filepath = temp_env->get_temp_filepath("ChunkedListNullable.parquet");
@@ -5880,7 +5880,7 @@ TEST_F(ParquetMetadataReaderTest, TestNested)
   EXPECT_EQ(out_map_col.type_kind(), cudf::io::parquet::TypeKind::UNDEFINED_TYPE);  // map
 
   ASSERT_EQ(out_map_col.num_children(), 1);
-  EXPECT_EQ(out_map_col.child(0).name(), "key_value");       // key_value (named in parquet writer)
+  EXPECT_EQ(out_map_col.child(0).name(), "key_value");  // key_value (named in parquet writer)
   ASSERT_EQ(out_map_col.child(0).num_children(), 2);
   EXPECT_EQ(out_map_col.child(0).child(0).name(), "key");    // key (named in parquet writer)
   EXPECT_EQ(out_map_col.child(0).child(1).name(), "value");  // value (named in parquet writer)
@@ -5897,7 +5897,7 @@ TEST_F(ParquetMetadataReaderTest, TestNested)
   ASSERT_EQ(out_list_col.child(0).num_children(), 1);
 
   auto const& out_list_struct_col = out_list_col.child(0).child(0);
-  EXPECT_EQ(out_list_struct_col.name(), "element");        // elements (named in parquet writer)
+  EXPECT_EQ(out_list_struct_col.name(), "element");  // elements (named in parquet writer)
   EXPECT_EQ(out_list_struct_col.type_kind(),
             cudf::io::parquet::TypeKind::UNDEFINED_TYPE);  // struct
   ASSERT_EQ(out_list_struct_col.num_children(), 2);
diff --git a/cpp/tests/lists/reverse_tests.cpp b/cpp/tests/lists/reverse_tests.cpp
index a899d387c3e..00dc13c5812 100644
--- a/cpp/tests/lists/reverse_tests.cpp
+++ b/cpp/tests/lists/reverse_tests.cpp
@@ -370,8 +370,8 @@ TYPED_TEST(ListsReverseTypedTest, InputListsOfStructsWithNulls)
                                          "Kiwi",
                                          "Cherry",
                                          "Banana",
-                                         "",        /*NULL*/
-                                         "",        /*NULL*/
+                                         "", /*NULL*/
+                                         "", /*NULL*/
                                          "Apple",
                                          "",        /*NULL*/
                                          "Banana",  // end list1
@@ -436,8 +436,8 @@ TYPED_TEST(ListsReverseTypedTest, InputListsOfStructsWithNulls)
                                          "Kiwi",
                                          "Cherry",
                                          "Banana",
-                                         "",        /*NULL*/
-                                         "",        /*NULL*/
+                                         "", /*NULL*/
+                                         "", /*NULL*/
                                          "Apple",
                                          "",        /*NULL*/
                                          "Banana",  // end list1
diff --git a/cpp/tests/lists/set_operations/difference_distinct_tests.cpp b/cpp/tests/lists/set_operations/difference_distinct_tests.cpp
index bf7ebc902ba..84c51f256b7 100644
--- a/cpp/tests/lists/set_operations/difference_distinct_tests.cpp
+++ b/cpp/tests/lists/set_operations/difference_distinct_tests.cpp
@@ -571,7 +571,7 @@ TEST_F(SetDifferenceTest, InputListsOfNestedStructsHaveNull)
                                        "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "Apple", "Banana",
                                        "Cherry",    "Kiwi",  // end list1
                                        "" /*NULL*/, "Bear",      "Cat",       "Dog",   "Duck",
-                                       "Panda",              // end list2
+                                       "Panda",  // end list2
                                        "ÁÁÁ",       "ÉÉÉÉÉ",     "ÁBC",       "ÁÁÁ",   "ÍÍÍÍÍ",
                                        "" /*NULL*/, "XYZ",
                                        "ÁBC"  // end list3
diff --git a/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp b/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp
index dbccf06036b..11f98af3520 100644
--- a/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp
+++ b/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp
@@ -514,7 +514,7 @@ TEST_F(SetIntersectTest, InputListsOfNestedStructsHaveNull)
                                       null,  // end list1
                                       null,  // end list2
                                       null,
-                                      null   // end list3
+                                      null  // end list3
                                     },
                                     all_nulls()};
       auto grandchild2 = strings_col{{
@@ -522,7 +522,7 @@ TEST_F(SetIntersectTest, InputListsOfNestedStructsHaveNull)
                                        "Apple",      // end list1
                                        "" /*NULL*/,  // end list2
                                        "ÁÁÁ",
-                                       "ÉÉÉÉÉ"       // end list3
+                                       "ÉÉÉÉÉ"  // end list3
                                      },
                                      nulls_at({0, 2})};
       auto child1      = structs_col{{grandchild1, grandchild2}, null_at(0)};
diff --git a/cpp/tests/lists/set_operations/union_distinct_tests.cpp b/cpp/tests/lists/set_operations/union_distinct_tests.cpp
index 5cc0897351d..e33ea31541b 100644
--- a/cpp/tests/lists/set_operations/union_distinct_tests.cpp
+++ b/cpp/tests/lists/set_operations/union_distinct_tests.cpp
@@ -560,7 +560,7 @@ TEST_F(SetUnionTest, InputListsOfNestedStructsHaveNull)
       auto grandchild2 =
         strings_col{{
                       "" /*NULL*/, "Apple",     "Banana", "Cherry", "Kiwi",  "Banana",    "Cherry",
-                      "Kiwi",                                       // end list1
+                      "Kiwi",  // end list1
                       "" /*NULL*/, "Bear",      "Cat",    "Dog",    "Duck",  "Panda",     "Bear",
                       "Cat",       "Dog",       "Duck",   "Panda",  // end list2
 
@@ -597,7 +597,7 @@ TEST_F(SetUnionTest, InputListsOfNestedStructsHaveNull)
         {
           "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "Apple",
           "Apple",     "Banana",    "Cherry",    "Kiwi",      "Banana",    "Cherry",
-          "Kiwi",                                                       // end list1
+          "Kiwi",  // end list1
           "" /*NULL*/, "" /*NULL*/, "Bear",      "Cat",       "Dog",       "Duck",      "Panda",
           "Bear",      "Cat",       "Dog",       "Duck",      "Panda",  // end list2
           "ÁÁÁ",       "ÁÁÁ",       "ÉÉÉÉÉ",     "ÉÉÉÉÉ",     "ÁBC",       "ÁÁÁ",       "ÍÍÍÍÍ",
diff --git a/cpp/tests/lists/stream_compaction/distinct_tests.cpp b/cpp/tests/lists/stream_compaction/distinct_tests.cpp
index 57d1714c255..fbc637f9315 100644
--- a/cpp/tests/lists/stream_compaction/distinct_tests.cpp
+++ b/cpp/tests/lists/stream_compaction/distinct_tests.cpp
@@ -529,7 +529,7 @@ TEST_F(ListDistinctTest, InputListsOfStructsHaveNull)
                               2,
                               3,
                               3,
-                              3},     // end list3
+                              3},  // end list3
                              nulls_at({1, 6, 12, 13})};
     auto child2 = strings_col{{       // begin list1
                                "XXX", /*NULL*/
@@ -551,7 +551,7 @@ TEST_F(ListDistinctTest, InputListsOfStructsHaveNull)
                                "ÁBC",
                                "ÁÁÁ",
                                "ÍÍÍÍÍ",
-                               "",      /*NULL*/
+                               "", /*NULL*/
                                "XYZ",
                                "ÁBC"},  // end list3
                               nulls_at({6, 17})};
@@ -670,7 +670,7 @@ TEST_F(ListDistinctTest, InputListsOfNestedStructsHaveNull)
                                      "ÁBC",
                                      "ÁÁÁ",
                                      "ÍÍÍÍÍ",
-                                     "",    /*NULL*/
+                                     "", /*NULL*/
                                      "XYZ",
                                      "ÁBC"  // end list3
                                    },
@@ -729,8 +729,8 @@ TEST_F(ListDistinctTest, InputListsOfStructsOfLists)
                                  floats_lists{3, 4, 5},  // end list2
                                                          // begin list3
                                  floats_lists{},
-                                 floats_lists{},         // end list3
-                                                         // begin list4
+                                 floats_lists{},  // end list3
+                                                  // begin list4
                                  floats_lists{6, 7},
                                  floats_lists{6, 7},
                                  floats_lists{6, 7}};
diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp
index eba6c961bbb..e8ea9d619c5 100644
--- a/cpp/tests/reshape/interleave_columns_tests.cpp
+++ b/cpp/tests/reshape/interleave_columns_tests.cpp
@@ -806,7 +806,7 @@ TYPED_TEST(ListsColumnsInterleaveTypedTest, SlicedInputListsOfListsWithNulls)
     ListsCol{ListsCol{{null, 11}, null_at(0)},
              ListsCol{{22, null, null}, nulls_at({1, 2})}},  // don't care
     ListsCol{ListsCol{{null, 11}, null_at(0)},
-             ListsCol{{22, null, null}, nulls_at({1, 2})}}   // don't care
+             ListsCol{{22, null, null}, nulls_at({1, 2})}}  // don't care
   };
 
   auto const col1 = cudf::slice(col1_original, {3, 6})[0];
diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp b/cpp/tests/rolling/range_rolling_window_test.cpp
index 585383f28f8..eed9db1fe04 100644
--- a/cpp/tests/rolling/range_rolling_window_test.cpp
+++ b/cpp/tests/rolling/range_rolling_window_test.cpp
@@ -91,7 +91,7 @@ struct window_exec {
   ScalarT preceding;             // Preceding window scalar.
   ScalarT following;             // Following window scalar.
   cudf::size_type min_periods = 1;
-};                               // struct window_exec;
+};  // struct window_exec;
 
 struct RangeRollingTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp
index b3f98eb54b9..da9666cbc74 100644
--- a/cpp/tests/sort/segmented_sort_tests.cpp
+++ b/cpp/tests/sort/segmented_sort_tests.cpp
@@ -270,7 +270,7 @@ TEST_F(SegmentedSortInt, Sliced)
   column_wrapper<int> expected2{{0, 1, 3, 2, 4, 5, 6}};
   column_wrapper<int> expected3{{0, 1, 2, 3, 4, 5, 6}};
   // clang-format on
-  auto slice = cudf::slice(col1, {4, 11})[0];          // 7 elements
+  auto slice = cudf::slice(col1, {4, 11})[0];  // 7 elements
   cudf::table_view input{{slice}};
   auto seg_slice = cudf::slice(segments2, {2, 4})[0];  // 2 elements
 
diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp
index a16da41af7a..c595977c269 100644
--- a/cpp/tests/strings/chars_types_tests.cpp
+++ b/cpp/tests/strings/chars_types_tests.cpp
@@ -50,17 +50,17 @@ TEST_P(CharsTypes, AllTypes)
                                      "\t\r\n\f "};
 
   bool expecteds[] = {false, false, false, false, false, false, false, false,
-                      false, false, false, false, false, true,  false, false,   // decimal
+                      false, false, false, false, false, true,  false, false,  // decimal
                       false, false, false, false, false, false, false, false,
-                      false, true,  false, true,  false, true,  false, false,   // numeric
+                      false, true,  false, true,  false, true,  false, false,  // numeric
                       false, false, false, false, false, false, false, false,
-                      false, false, false, true,  false, true,  false, false,   // digit
+                      false, false, false, true,  false, true,  false, false,  // digit
                       true,  true,  false, true,  false, false, false, false,
-                      false, false, false, false, false, false, true,  false,   // alpha
+                      false, false, false, false, false, false, true,  false,  // alpha
                       false, false, false, false, false, false, false, false,
-                      false, false, false, false, false, false, false, true,    // space
+                      false, false, false, false, false, false, false, true,  // space
                       false, false, false, true,  false, false, false, false,
-                      false, false, false, false, false, false, false, false,   // upper
+                      false, false, false, false, false, false, false, false,  // upper
                       false, true,  false, false, false, false, false, false,
                       false, false, false, false, false, false, true,  false};  // lower
 
diff --git a/cpp/tests/strings/durations_tests.cpp b/cpp/tests/strings/durations_tests.cpp
index 0c7a1ad8042..1902f907f43 100644
--- a/cpp/tests/strings/durations_tests.cpp
+++ b/cpp/tests/strings/durations_tests.cpp
@@ -398,7 +398,7 @@ TEST_F(StringsDurationsTest, ParseSingle)
                                                 "-59",
                                                 "999",
                                                 "-999",
-                                                "",   // error
+                                                "",  // error
                                                 "01",
                                                 ""};  // error
   auto size = cudf::column_view(string_src).size();
@@ -449,7 +449,7 @@ TEST_F(StringsDurationsTest, ParseMultiple)
                                                 "-59:00:00",
                                                 "999:00:00",
                                                 "-999:00:00",
-                                                "",   // error
+                                                "",  // error
                                                 "01:01:01",
                                                 ""};  // error
   auto size = cudf::column_view(string_src).size();
@@ -503,7 +503,7 @@ TEST_F(StringsDurationsTest, ParseSubsecond)
                                                 "-59:00:00",
                                                 "999:00:00",
                                                 "-999:00:00",
-                                                "",   // error
+                                                "",  // error
                                                 "01:01:01",
                                                 ""};  // error
   auto size = cudf::column_view(string_src).size();
@@ -660,7 +660,7 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier)
                                                  "09:00 AM",  // error
                                                  "",          // error
                                                  "01:01:01",
-                                                 ""};         // error
+                                                 ""};  // error
 
   cudf::test::fixed_width_column_wrapper<cudf::duration_s, int64_t> expected_s3(
     {0,
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index bae402155e9..620e0bfe8de 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -440,7 +440,7 @@ class corresponding_rows_not_equivalent {
 
         // Must handle inf and nan separately
         if (std::isinf(x) || std::isinf(y)) {
-          return x != y;                          // comparison of (inf==inf) returns true
+          return x != y;  // comparison of (inf==inf) returns true
         } else if (std::isnan(x) || std::isnan(y)) {
           return std::isnan(x) != std::isnan(y);  // comparison of (nan==nan) returns false
         } else {

From 97501d87e2070e8f07eb17b2c5e59742c490c6b1 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Wed, 20 Sep 2023 07:42:20 +0530
Subject: [PATCH 14/23] Long string optimization for string column parsing in
 JSON reader (#13803)

closes #13724

In the old code, 1 thread per string is allocated for parsing a string column.
For longer strings (length >1024), decoding with 1 thread per string takes too long, even for only a few strings.

With this change, 1 warp per string is used for parsing strings of length <=1024, and 1 block per string for strings of length >1024. If the max string length is < 128, 1 thread per string is used as before.

A threads_per_block of 256 is used for both kernels.
The code for 1-warp-per-string and 1-block-per-string is similar; it differs only in using warp-wide versus block-wide primitives for reduction and scan operations. Shared memory usage differs slightly too.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Elias Stehle (https://github.com/elstehle)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/13803
---
 cpp/CMakeLists.txt                            |   2 +
 cpp/include/cudf/io/detail/data_casting.cuh   | 431 --------
 cpp/src/io/json/json_column.cu                |  39 +-
 cpp/src/io/json/nested_json_gpu.cu            |  22 +-
 cpp/src/io/json/write_json.cu                 |   3 +-
 cpp/src/io/utilities/data_casting.cu          | 987 ++++++++++++++++++
 cpp/src/io/utilities/parsing_utils.cuh        |  24 +-
 cpp/src/io/utilities/string_parsing.hpp       |  79 ++
 .../{type_inference.cuh => type_inference.cu} |  57 +-
 cpp/tests/io/json_test.cpp                    | 119 +++
 cpp/tests/io/json_type_cast_test.cu           | 189 +++-
 cpp/tests/io/type_inference_test.cu           |  30 +-
 12 files changed, 1395 insertions(+), 587 deletions(-)
 delete mode 100644 cpp/include/cudf/io/detail/data_casting.cuh
 create mode 100644 cpp/src/io/utilities/data_casting.cu
 create mode 100644 cpp/src/io/utilities/string_parsing.hpp
 rename cpp/src/io/utilities/{type_inference.cuh => type_inference.cu} (84%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 900e9eed98e..a84f7bd5224 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -413,11 +413,13 @@ add_library(
   src/io/utilities/arrow_io_source.cpp
   src/io/utilities/column_buffer.cpp
   src/io/utilities/config_utils.cpp
+  src/io/utilities/data_casting.cu
   src/io/utilities/data_sink.cpp
   src/io/utilities/datasource.cpp
   src/io/utilities/file_io_utilities.cpp
   src/io/utilities/parsing_utils.cu
   src/io/utilities/row_selection.cpp
+  src/io/utilities/type_inference.cu
   src/io/utilities/trie.cu
   src/jit/cache.cpp
   src/jit/parser.cpp
diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
deleted file mode 100644
index b7ee5e05e96..00000000000
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ /dev/null
@@ -1,431 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <io/utilities/parsing_utils.cuh>
-
-#include <cudf/column/column.hpp>
-#include <cudf/column/column_device_view.cuh>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
-#include <cudf/strings/detail/utf8.hpp>
-#include <cudf/types.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <cub/cub.cuh>
-
-#include <memory>
-
-namespace cudf::io::json::detail {
-
-// Unicode code point escape sequence
-static constexpr char UNICODE_SEQ = 0x7F;
-
-// Invalid escape sequence
-static constexpr char NON_ESCAPE_CHAR = 0x7E;
-
-// Unicode code point escape sequence prefix comprises '\' and 'u' characters
-static constexpr size_type UNICODE_ESC_PREFIX = 2;
-
-// Unicode code point escape sequence comprises four hex characters
-static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4;
-
-// A unicode code point escape sequence is \uXXXX
-static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT;
-
-static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800;
-static constexpr auto UTF16_HIGH_SURROGATE_END   = 0xDC00;
-static constexpr auto UTF16_LOW_SURROGATE_BEGIN  = 0xDC00;
-static constexpr auto UTF16_LOW_SURROGATE_END    = 0xE000;
-
-/**
- * @brief Describing whether data casting of a certain item succeed, the item was parsed to null, or
- * whether type casting failed.
- */
-enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE };
-
-/**
- * @brief Providing additional information about the type casting result.
- */
-struct data_casting_result_info {
-  // Number of bytes written to output
-  size_type bytes;
-  // Whether parsing succeeded, item was parsed to null, or failed
-  data_casting_result result;
-};
-
-/**
- * @brief Returns the character to output for a given escaped character that's following a
- * backslash.
- *
- * @param escaped_char The character following the backslash.
- * @return The character to output for a given character that's following a backslash
- */
-__device__ __forceinline__ char get_escape_char(char escaped_char)
-{
-  switch (escaped_char) {
-    case '"': return '"';
-    case '\\': return '\\';
-    case '/': return '/';
-    case 'b': return '\b';
-    case 'f': return '\f';
-    case 'n': return '\n';
-    case 'r': return '\r';
-    case 't': return '\t';
-    case 'u': return UNICODE_SEQ;
-    default: return NON_ESCAPE_CHAR;
-  }
-}
-
-/**
- * @brief Returns the escaped characters for a given character.
- *
- * @param escaped_char The character to escape.
- * @return The escaped characters for a given character.
- */
-__device__ __forceinline__ thrust::pair<char, char> get_escaped_char(char escaped_char)
-{
-  switch (escaped_char) {
-    case '"': return {'\\', '"'};
-    case '\\': return {'\\', '\\'};
-    case '/': return {'\\', '/'};
-    case '\b': return {'\\', 'b'};
-    case '\f': return {'\\', 'f'};
-    case '\n': return {'\\', 'n'};
-    case '\r': return {'\\', 'r'};
-    case '\t': return {'\\', 't'};
-    // case 'u': return UNICODE_SEQ;
-    default: return {'\0', escaped_char};
-  }
-}
-/**
- * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence
- * \uXXXX.
- *
- * @param str Pointer to the first (most-significant) hex digit
- * @return The parsed hex value if successful, -1 otherwise.
- */
-__device__ __forceinline__ int32_t parse_unicode_hex(char const* str)
-{
-  // Prepare result
-  int32_t result = 0, base = 1;
-  constexpr int32_t hex_radix = 16;
-
-  // Iterate over hex digits right-to-left
-  size_type index = UNICODE_HEX_DIGIT_COUNT;
-  while (index-- > 0) {
-    char const ch = str[index];
-    if (ch >= '0' && ch <= '9') {
-      result += static_cast<int32_t>((ch - '0') + 0) * base;
-      base *= hex_radix;
-    } else if (ch >= 'A' && ch <= 'F') {
-      result += static_cast<int32_t>((ch - 'A') + 10) * base;
-      base *= hex_radix;
-    } else if (ch >= 'a' && ch <= 'f') {
-      result += static_cast<int32_t>((ch - 'a') + 10) * base;
-      base *= hex_radix;
-    } else {
-      return -1;
-    }
-  }
-  return result;
-}
-
-/**
- * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to
- * \p out_it
- */
-constexpr size_type write_utf8_char(char_utf8 character, char*& out_it)
-{
-  auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character)
-                                         : strings::detail::from_char_utf8(character, out_it);
-  if (out_it) out_it += bytes;
-  return bytes;
-}
-
-/**
- * @brief Processes a string, replaces escape sequences and optionally strips off the quote
- * characters.
- *
- * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to
- * char
- * @param in_begin Iterator to the first item to process
- * @param in_end Iterator to one past the last item to process
- * @param d_buffer Output character buffer to the first item to write
- * @param options Settings for controlling string processing behavior
- * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is
- * the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether
- * parsing succeeded, item was parsed to null, or failed.
- */
-template <typename in_iterator_t>
-__device__ __forceinline__ data_casting_result_info
-process_string(in_iterator_t in_begin,
-               in_iterator_t in_end,
-               char* d_buffer,
-               cudf::io::parse_options_view const& options)
-{
-  int32_t bytes           = 0;
-  auto const num_in_chars = thrust::distance(in_begin, in_end);
-  // String values are indicated by keeping the quote character
-  bool const is_string_value =
-    num_in_chars >= 2LL &&
-    (options.quotechar == '\0' ||
-     (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar));
-
-  // Copy literal/numeric value
-  if (not is_string_value) {
-    while (in_begin != in_end) {
-      if (d_buffer) *d_buffer++ = *in_begin;
-      ++in_begin;
-      ++bytes;
-    }
-    return {bytes, data_casting_result::PARSING_SUCCESS};
-  }
-  // Whether in the original JSON this was a string value enclosed in quotes
-  // ({"a":"foo"} vs. {"a":1.23})
-  char const backslash_char = '\\';
-
-  // Escape-flag, set after encountering a backslash character
-  bool escape = false;
-
-  // Exclude beginning and ending quote chars from string range
-  if (!options.keepquotes) {
-    ++in_begin;
-    --in_end;
-  }
-
-  // Iterate over the input
-  while (in_begin != in_end) {
-    // Copy single character to output
-    if (!escape) {
-      escape = (*in_begin == backslash_char);
-      if (!escape) {
-        if (d_buffer) *d_buffer++ = *in_begin;
-        ++bytes;
-      }
-      ++in_begin;
-      continue;
-    }
-
-    // Previous char indicated beginning of escape sequence
-    // Reset escape flag for next loop iteration
-    escape = false;
-
-    // Check the character that is supposed to be escaped
-    auto escaped_char = get_escape_char(*in_begin);
-
-    // We escaped an invalid escape character -> "fail"/null for this item
-    if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; }
-
-    // Regular, single-character escape
-    if (escaped_char != UNICODE_SEQ) {
-      if (d_buffer) *d_buffer++ = escaped_char;
-      ++bytes;
-      ++in_begin;
-      continue;
-    }
-
-    // This is an escape sequence of a unicode code point: \uXXXX,
-    // where each X in XXXX represents a hex digit
-    // Skip over the 'u' char from \uXXXX to the first hex digit
-    ++in_begin;
-
-    // Make sure that there's at least 4 characters left from the
-    // input, which are expected to be hex digits
-    if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) {
-      return {bytes, data_casting_result::PARSING_FAILURE};
-    }
-
-    auto hex_val = parse_unicode_hex(in_begin);
-
-    // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item
-    if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; }
-
-    // Skip over the four hex digits
-    thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT);
-
-    // If this may be a UTF-16 encoded surrogate pair:
-    // we expect another \uXXXX sequence
-    int32_t hex_low_val = 0;
-    if (thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS &&
-        *in_begin == backslash_char && *thrust::next(in_begin) == 'u') {
-      // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low
-      // surrogate
-      hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2));
-    }
-
-    // This is indeed a UTF16 surrogate pair
-    if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
-        hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) {
-      // Skip over the second \uXXXX sequence
-      thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS);
-
-      // Compute UTF16-encoded code point
-      uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) +
-                                    (hex_low_val - UTF16_LOW_SURROGATE_BEGIN);
-      auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point);
-      bytes += write_utf8_char(utf8_chars, d_buffer);
-    }
-
-    // Just a single \uXXXX sequence
-    else {
-      auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val);
-      bytes += write_utf8_char(utf8_chars, d_buffer);
-    }
-  }
-
-  // The last character of the input is a backslash -> "fail"/null for this item
-  if (escape) { return {bytes, data_casting_result::PARSING_FAILURE}; }
-  return {bytes, data_casting_result::PARSING_SUCCESS};
-}
-
-template <typename str_tuple_it>
-struct string_parse {
-  str_tuple_it str_tuples;
-  bitmask_type* null_mask;
-  size_type* null_count_data;
-  cudf::io::parse_options_view const options;
-  size_type* d_offsets{};
-  char* d_chars{};
-
-  __device__ void operator()(size_type idx)
-  {
-    if (null_mask != nullptr && not bit_is_set(null_mask, idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
-      return;
-    }
-    auto const in_begin     = str_tuples[idx].first;
-    auto const in_end       = in_begin + str_tuples[idx].second;
-    auto const num_in_chars = str_tuples[idx].second;
-
-    // Check if the value corresponds to the null literal
-    auto const is_null_literal =
-      (!d_chars) &&
-      serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
-    if (is_null_literal && null_mask != nullptr) {
-      clear_bit(null_mask, idx);
-      atomicAdd(null_count_data, 1);
-      if (!d_chars) d_offsets[idx] = 0;
-      return;
-    }
-
-    char* d_buffer        = d_chars ? d_chars + d_offsets[idx] : nullptr;
-    auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
-    if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
-      if (null_mask != nullptr) {
-        clear_bit(null_mask, idx);
-        atomicAdd(null_count_data, 1);
-      }
-      if (!d_chars) d_offsets[idx] = 0;
-    } else {
-      if (!d_chars) d_offsets[idx] = str_process_info.bytes;
-    }
-  }
-};
-/**
- * @brief Parses the data from an iterator of string views, casting it to the given target data type
- *
- * @param str_tuples Iterator returning a string view, i.e., a (ptr, length) pair
- * @param col_size The total number of items of this column
- * @param col_type The column's target data type
- * @param null_mask A null mask that renders certain items from the input invalid
- * @param options Settings for controlling the processing behavior
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr The resource to be used for device memory allocation
- * @return The column that contains the parsed data
- */
-template <typename str_tuple_it, typename B>
-std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
-                                   size_type col_size,
-                                   data_type col_type,
-                                   B&& null_mask,
-                                   size_type null_count,
-                                   cudf::io::parse_options_view const& options,
-                                   rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-
-  auto d_null_count    = rmm::device_scalar<size_type>(null_count, stream);
-  auto null_count_data = d_null_count.data();
-
-  if (col_type == cudf::data_type{cudf::type_id::STRING}) {
-    // this utility calls the functor to build the offsets and chars columns;
-    // the bitmask and null count may be updated by parse failures
-    auto [offsets, chars] = cudf::strings::detail::make_strings_children(
-      string_parse<decltype(str_tuples)>{
-        str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options},
-      col_size,
-      stream,
-      mr);
-
-    return make_strings_column(col_size,
-                               std::move(offsets),
-                               std::move(chars),
-                               d_null_count.value(stream),
-                               std::move(null_mask));
-  }
-
-  auto out_col =
-    make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr);
-  auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream);
-
-  // use existing code (`ConvertFunctor`) to convert values
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    col_size,
-    [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__(
-      size_type row) {
-      if (col.is_null(row)) { return; }
-      auto const in = str_tuples[row];
-
-      auto const is_null_literal =
-        serialized_trie_contains(options.trie_na, {in.first, static_cast<size_t>(in.second)});
-
-      if (is_null_literal) {
-        col.set_null(row);
-        atomicAdd(null_count_data, 1);
-        return;
-      }
-
-      // If this is a string value, remove quotes
-      auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar);
-
-      auto const is_parsed = cudf::type_dispatcher(col_type,
-                                                   ConvertFunctor{},
-                                                   in_begin,
-                                                   in_end,
-                                                   col.data<char>(),
-                                                   row,
-                                                   col_type,
-                                                   options,
-                                                   false);
-      if (not is_parsed) {
-        col.set_null(row);
-        atomicAdd(null_count_data, 1);
-      }
-    });
-
-  out_col->set_null_count(d_null_count.value(stream));
-
-  return out_col;
-}
-
-}  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index cabf904f020..5d7fb9d6b43 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -16,14 +16,13 @@
 
 #include "nested_json.hpp"
 #include <io/utilities/parsing_utils.cuh>
-#include <io/utilities/type_inference.cuh>
+#include <io/utilities/string_parsing.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
-#include <cudf/io/detail/data_casting.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
@@ -331,23 +330,27 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
 {
   CUDF_FUNC_RANGE();
   auto const num_strings = node_range_begin.size();
-  rmm::device_uvector<thrust::pair<char const*, size_type>> string_views(num_strings, stream);
+  rmm::device_uvector<size_type> string_offsets(num_strings, stream);
+  rmm::device_uvector<size_type> string_lengths(num_strings, stream);
   auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin());
   thrust::transform(rmm::exec_policy(stream),
                     d_offset_pairs,
                     d_offset_pairs + num_strings,
-                    string_views.begin(),
-                    [data = input.data()] __device__(auto const& offsets) {
+                    thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()),
+                    [] __device__(auto const& offsets) {
                       // Note: first character for non-field columns
-                      return thrust::make_pair(
-                        data + thrust::get<0>(offsets),
+                      return thrust::make_tuple(
+                        static_cast<size_type>(thrust::get<0>(offsets)),
                         static_cast<size_type>(thrust::get<1>(offsets) - thrust::get<0>(offsets)));
                     });
 
   cudf::io::parse_options_view options_view{};
   options_view.quotechar  = '\0';  // no quotes
   options_view.keepquotes = true;
-  auto d_column_names     = parse_data(string_views.begin(),
+  auto d_offset_length_it =
+    thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin());
+  auto d_column_names = parse_data(input.data(),
+                                   d_offset_length_it,
                                    num_strings,
                                    data_type{type_id::STRING},
                                    rmm::device_buffer{},
@@ -355,7 +358,7 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
                                    options_view,
                                    stream,
                                    rmm::mr::get_current_device_resource());
-  auto to_host            = [stream](auto const& col) {
+  auto to_host        = [stream](auto const& col) {
     if (col.is_empty()) return std::vector<std::string>{};
     auto const scv     = cudf::strings_column_view(col);
     auto const h_chars = cudf::detail::make_std_vector_sync<char>(
@@ -763,19 +766,6 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
       // TODO how about directly storing pair<char*, size_t> in json_column?
       auto offset_length_it =
         thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin());
-      // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference
-      auto string_ranges_it =
-        thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) {
-          return thrust::pair<json_column::row_offset_t, std::size_t>{
-            thrust::get<0>(ip), static_cast<std::size_t>(thrust::get<1>(ip))};
-        });
-
-      // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
-      auto string_spans_it = thrust::make_transform_iterator(
-        offset_length_it, [data = d_input.data()] __device__(auto ip) {
-          return thrust::pair<char const*, std::size_t>{
-            data + thrust::get<0>(ip), static_cast<std::size_t>(thrust::get<1>(ip))};
-        });
 
       data_type target_type{};
 
@@ -790,12 +780,13 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
       // Infer column type, if we don't have an explicit type for it
       else {
         target_type = cudf::io::detail::infer_data_type(
-          options.json_view(), d_input, string_ranges_it, col_size, stream);
+          options.json_view(), d_input, offset_length_it, col_size, stream);
       }
 
       auto [result_bitmask, null_count] = make_validity(json_col);
       // Convert strings to the inferred data type
-      auto col = parse_data(string_spans_it,
+      auto col = parse_data(d_input.data(),
+                            offset_length_it,
                             col_size,
                             target_type,
                             std::move(result_bitmask),
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 0b49f97597d..06ac11485cb 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -19,14 +19,13 @@
 #include <io/fst/logical_stack.cuh>
 #include <io/fst/lookup_tables.cuh>
 #include <io/utilities/parsing_utils.cuh>
-#include <io/utilities/type_inference.cuh>
+#include <io/utilities/string_parsing.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/detail/valid_if.cuh>
-#include <cudf/io/detail/data_casting.cuh>
 #include <cudf/io/detail/tokenize_json.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/table/table.hpp>
@@ -1949,20 +1948,6 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
       auto offset_length_it =
         thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin());
 
-      // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference
-      auto string_ranges_it =
-        thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) {
-          return thrust::pair<json_column::row_offset_t, std::size_t>{
-            thrust::get<0>(ip), static_cast<std::size_t>(thrust::get<1>(ip))};
-        });
-
-      // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
-      auto string_spans_it = thrust::make_transform_iterator(
-        offset_length_it, [data = d_input.data()] __device__(auto ip) {
-          return thrust::pair<char const*, std::size_t>{
-            data + thrust::get<0>(ip), static_cast<std::size_t>(thrust::get<1>(ip))};
-        });
-
       data_type target_type{};
 
       if (schema.has_value()) {
@@ -1978,7 +1963,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
         target_type =
           cudf::io::detail::infer_data_type(parsing_options(options, stream).json_view(),
                                             d_input,
-                                            string_ranges_it,
+                                            offset_length_it,
                                             col_size,
                                             stream);
       }
@@ -1986,7 +1971,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
       auto [result_bitmask, null_count] = make_validity(json_col);
 
       // Convert strings to the inferred data type
-      auto col = parse_data(string_spans_it,
+      auto col = parse_data(d_input.data(),
+                            offset_length_it,
                             col_size,
                             target_type,
                             std::move(result_bitmask),
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index 1e44522ed33..2d363c51fce 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -20,6 +20,7 @@
  */
 
 #include <io/csv/durations.hpp>
+#include <io/utilities/parsing_utils.cuh>
 #include <lists/utilities.hpp>
 
 #include <cudf/column/column_device_view.cuh>
@@ -27,9 +28,9 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/data_sink.hpp>
-#include <cudf/io/detail/data_casting.cuh>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
new file mode 100644
index 00000000000..1772e5e43fa
--- /dev/null
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -0,0 +1,987 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <io/utilities/parsing_utils.cuh>
+#include <io/utilities/string_parsing.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/utf8.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/functional.h>
+#include <thrust/transform_reduce.h>
+
+#include <cub/cub.cuh>
+
+#include <memory>
+#include <type_traits>
+
+namespace cudf::io::json::detail {
+
+constexpr auto SINGLE_THREAD_THRESHOLD = 128;  // warp kernel skips strings up to this length
+constexpr auto WARP_THRESHOLD          = 128 * 128;  // 16K; longer strings use the block kernel
+
+// Sentinel returned by get_escape_char for 'u': start of a unicode code point escape sequence
+static constexpr char UNICODE_SEQ = 0x7F;
+
+// Sentinel returned by get_escape_char for a character that does not form a valid escape sequence
+static constexpr char NON_ESCAPE_CHAR = 0x7E;
+
+// Unicode code point escape sequence prefix comprises '\' and 'u' characters
+static constexpr size_type UNICODE_ESC_PREFIX = 2;
+
+// Unicode code point escape sequence comprises four hex characters
+static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4;
+
+// Total length of a unicode code point escape sequence \uXXXX (prefix plus hex digits)
+static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT;
+
+static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800;  // inclusive
+static constexpr auto UTF16_HIGH_SURROGATE_END   = 0xDC00;  // exclusive
+static constexpr auto UTF16_LOW_SURROGATE_BEGIN  = 0xDC00;  // inclusive
+static constexpr auto UTF16_LOW_SURROGATE_END    = 0xE000;  // exclusive
+
+/**
+ * @brief Describes whether data casting of an item succeeded, the item was parsed to null, or
+ * type casting failed.
+ */
+enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE };
+
+/**
+ * @brief Result of casting a single item: the size of its output together with the status.
+ */
+struct data_casting_result_info {
+  // Number of output bytes produced for the item
+  size_type bytes;
+  // Whether parsing succeeded, item was parsed to null, or failed
+  data_casting_result result;
+};
+
+/**
+ * @brief Maps the character that follows a backslash to the character the escape stands for.
+ *
+ *
+ * @param escaped_char The character following the backslash.
+ * @return The unescaped character, `UNICODE_SEQ` for 'u', or `NON_ESCAPE_CHAR` for any other char
+ */
+__device__ __forceinline__ char get_escape_char(char escaped_char)
+{
+  switch (escaped_char) {
+    case '"': return '"';
+    case '\\': return '\\';
+    case '/': return '/';
+    case 'b': return '\b';
+    case 'f': return '\f';
+    case 'n': return '\n';
+    case 'r': return '\r';
+    case 't': return '\t';
+    case 'u': return UNICODE_SEQ;
+    default: return NON_ESCAPE_CHAR;
+  }
+}
+
+/**
+ * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence
+ * \uXXXX.
+ *
+ * @param str Pointer to the first (most-significant) of the UNICODE_HEX_DIGIT_COUNT hex digits
+ * @return The parsed hex value if successful, -1 if any of the characters is not a hex digit.
+ */
+__device__ __forceinline__ int32_t parse_unicode_hex(char const* str)
+{
+  // result accumulates the value; base is the weight (power of 16) of the current digit
+  int32_t result = 0, base = 1;
+  constexpr int32_t hex_radix = 16;
+
+  // Iterate over hex digits right-to-left, i.e. from least to most significant
+  size_type index = UNICODE_HEX_DIGIT_COUNT;
+  while (index-- > 0) {
+    char const ch = str[index];
+    if (ch >= '0' && ch <= '9') {
+      result += static_cast<int32_t>((ch - '0') + 0) * base;
+      base *= hex_radix;
+    } else if (ch >= 'A' && ch <= 'F') {
+      result += static_cast<int32_t>((ch - 'A') + 10) * base;
+      base *= hex_radix;
+    } else if (ch >= 'a' && ch <= 'f') {
+      result += static_cast<int32_t>((ch - 'a') + 10) * base;
+      base *= hex_radix;
+    } else {
+      return -1;
+    }
+  }
+  return result;
+}
+
+/**
+ * @brief Writes the UTF-8 bytes of \p character to \p out_it and advances \p out_it past them; if
+ * \p out_it is nullptr nothing is written. Returns the byte count of the character either way.
+ */
+constexpr size_type write_utf8_char(char_utf8 character, char*& out_it)
+{
+  auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character)
+                                         : strings::detail::from_char_utf8(character, out_it);
+  if (out_it) out_it += bytes;
+  return bytes;
+}
+
+/**
+ * @brief Processes a string, replaces escape sequences and optionally strips off the quote
+ * characters.
+ *
+ * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to
+ * char
+ * @param in_begin Iterator to the first item to process
+ * @param in_end Iterator to one past the last item to process
+ * @param d_buffer Where to write the output; if nullptr, the output bytes are only counted
+ * @param options Settings for controlling string processing behavior
+ * @return A struct of (num_bytes, parsing_success_result), where num_bytes is the number of
+ * output bytes produced so far (written to d_buffer, if given) and parsing_success_result is
+ * PARSING_SUCCESS, or PARSING_FAILURE if an invalid or truncated escape sequence was found.
+ */
+template <typename in_iterator_t>
+__device__ __forceinline__ data_casting_result_info
+process_string(in_iterator_t in_begin,
+               in_iterator_t in_end,
+               char* d_buffer,
+               cudf::io::parse_options_view const& options)
+{
+  int32_t bytes           = 0;
+  auto const num_in_chars = thrust::distance(in_begin, in_end);
+  // String values are indicated by keeping the quote character
+  bool const is_string_value =
+    num_in_chars >= 2LL &&
+    (options.quotechar == '\0' ||
+     (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar));
+
+  // Copy literal/numeric value verbatim, without any escape processing
+  if (not is_string_value) {
+    bytes += (in_end - in_begin);
+    if (d_buffer) d_buffer = thrust::copy(thrust::seq, in_begin, in_end, d_buffer);
+    return {bytes, data_casting_result::PARSING_SUCCESS};
+  }
+  char constexpr backslash_char = '\\';
+
+  // Escape-flag, set after encountering a backslash character
+  bool is_prev_char_escape = false;
+
+  // Exclude beginning and ending quote chars from string range
+  if (!options.keepquotes) {
+    ++in_begin;
+    --in_end;
+  }
+
+  // Iterate over the input
+  while (in_begin != in_end) {
+    // Copy single character to output, unless it is a backslash that starts an escape sequence
+    if (!is_prev_char_escape) {
+      is_prev_char_escape = (*in_begin == backslash_char);
+      if (!is_prev_char_escape) {
+        if (d_buffer) *d_buffer++ = *in_begin;
+        ++bytes;
+      }
+      ++in_begin;
+      continue;
+    }
+
+    // Previous char indicated beginning of escape sequence
+    // Reset escape flag for next loop iteration
+    is_prev_char_escape = false;
+
+    // Check the character that is supposed to be escaped
+    auto escaped_char = get_escape_char(*in_begin);
+
+    // We escaped an invalid escape character -> "fail"/null for this item
+    if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; }
+
+    // Regular, single-character escape
+    if (escaped_char != UNICODE_SEQ) {
+      if (d_buffer) *d_buffer++ = escaped_char;
+      ++bytes;
+      ++in_begin;
+      continue;
+    }
+
+    // This is an escape sequence of a unicode code point: \uXXXX,
+    // where each X in XXXX represents a hex digit
+    // Skip over the 'u' char from \uXXXX to the first hex digit
+    ++in_begin;
+
+    // Make sure that there are at least 4 characters left in the
+    // input, which are expected to be hex digits
+    if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) {
+      return {bytes, data_casting_result::PARSING_FAILURE};
+    }
+
+    auto hex_val = parse_unicode_hex(in_begin);
+
+    // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item
+    if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; }
+
+    // Skip over the four hex digits
+    thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT);
+
+    // If this may be a UTF-16 encoded surrogate pair:
+    // we expect another \uXXXX sequence
+    int32_t hex_low_val = 0;
+    if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
+        thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS &&
+        *in_begin == backslash_char && *thrust::next(in_begin) == 'u') {
+      // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low
+      // surrogate
+      hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2));
+    }
+
+    // This is indeed a UTF16 surrogate pair
+    if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
+        hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) {
+      // Skip over the second \uXXXX sequence
+      thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS);
+
+      // Combine the surrogate pair into the code point it encodes
+      uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) +
+                                    (hex_low_val - UTF16_LOW_SURROGATE_BEGIN);
+      auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point);
+      bytes += write_utf8_char(utf8_chars, d_buffer);
+    } else {
+      // Just a single \uXXXX sequence
+      auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val);
+      bytes += write_utf8_char(utf8_chars, d_buffer);
+    }
+  }
+
+  // The last character of the input is a backslash -> "fail"/null for this item
+  if (is_prev_char_escape) { return {bytes, data_casting_result::PARSING_FAILURE}; }
+  return {bytes, data_casting_result::PARSING_SUCCESS};
+}
+
+/**
+ * @brief Holds one flag per lane for each warp of the block, preceded by the last
+ * `UNICODE_LOOK_BACK` flags that the same warp stored before the most recent `shift()`.
+ *
+ * @tparam num_warps number of warps in the block
+ */
+template <unsigned num_warps>
+struct bitfield_warp {
+  static constexpr auto UNICODE_LOOK_BACK{5};
+  // 5 because for skipping unicode hex chars, a look back of up to 5 chars is needed.
+  // 5+32 flags for each warp: [0, 5) is the look-back region, [5, 37) the per-lane flags.
+  bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size];
+
+  /// Sets all flags of warp `warp_id` (look-back region included) to 0
+  __device__ void reset(unsigned warp_id)
+  {
+    if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) {
+      is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = 0;
+    }
+    is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0;
+  }
+
+  /// Copies the last UNICODE_LOOK_BACK flags of the warp to the front (the look-back region)
+  __device__ void shift(unsigned warp_id)
+  {
+    if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK)
+      is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] =
+        is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size];
+    __syncwarp();
+  }
+
+  /// Each thread in a warp sets its own flag.
+  __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash)
+  {
+    is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] =
+      is_escaping_backslash;
+    __syncwarp();
+  }
+
+  /// Returns the flag at `bit_index`; indices in [-UNICODE_LOOK_BACK, 0) read the look-back region
+  __device__ bool get_bit(unsigned warp_id, int bit_index)
+  {
+    return is_slash[warp_id][UNICODE_LOOK_BACK + bit_index];
+  }
+};
+
+/**
+ * @brief Holds one flag per thread of the block, preceded by the last `UNICODE_LOOK_BACK` flags
+ * that the block stored before the most recent `shift()`.
+ *
+ * @tparam num_warps number of warps in the block
+ */
+template <unsigned num_warps>
+struct bitfield_block {
+  static constexpr auto UNICODE_LOOK_BACK{5};
+  // 5 because for skipping unicode hex chars, a look back of up to 5 chars is needed.
+  // 5 + num_warps*32 flags for the entire block; [0, 5) is the look-back region.
+  bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size];
+
+  /// Sets all flags to 0 (warp_id is unused here; same signature as bitfield_warp::reset)
+  __device__ void reset(unsigned warp_id)
+  {
+    if (threadIdx.x < UNICODE_LOOK_BACK) { is_slash[threadIdx.x] = 0; }
+    is_slash[threadIdx.x + UNICODE_LOOK_BACK] = 0;
+  }
+
+  /// Copies the last UNICODE_LOOK_BACK flags of the block to the front (the look-back region)
+  __device__ void shift(unsigned warp_id)
+  {
+    if (threadIdx.x < UNICODE_LOOK_BACK)
+      is_slash[threadIdx.x] = is_slash[num_warps * cudf::detail::warp_size + threadIdx.x];
+    __syncthreads();
+  }
+
+  /// Each thread in a block sets its own flag.
+  __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash)
+  {
+    is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash;
+    __syncthreads();
+  }
+
+  /// Returns the flag at `bit_index`; indices in [-UNICODE_LOOK_BACK, 0) read the look-back region
+  __device__ bool get_bit(unsigned warp_id, int bit_index)
+  {
+    return is_slash[UNICODE_LOOK_BACK + bit_index];
+  }
+};
+
+// Algorithm: warp/block parallel version of string_parse and process_string()
+// Decoding character classes (u8, u16, \*, *):
+// character      count: input->output
+// \uXXXX         6->1/2/3  (UTF-8 size of a code point up to U+FFFF)
+// \uXXXX\uXXXX  12->4      (surrogate pair, i.e. a code point of U+10000 and above)
+// \"             2->1
+// *              1->1
+//
+// ERROR conditions. (all collaborating threads quit)
+// c=='\' & curr_idx == end_idx-1;
+// [c-1]=='\' &  get_escape_char([c])==NON_ESCAPE_CHAR (abbreviated NEC below)
+// [c-1]=='\' &  [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT
+// [c-1]=='\' &  [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex
+//
+// skip conditions. (current thread skips this char, no output)
+// c=='\' skip. (Escaping char only)
+// [c-2]=='\' && [c-1]=='u', likewise for offsets [3,2], [4,3], [5,4]: skip (hex digit of \uXXXX)
+//
+// write conditions. (write to d_buffer)
+// [c-1]!='\' &  [c]!='\' write [c]
+// [c-1]!='\' &  [c]=='\' skip (already covered in skip conditions)
+// [c-1]=='\' &  [c]!=NEC && [c]!=UNICODE_SEQ, write [c]
+// [c-1]=='\' &  [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE
+// [c+1:4]=curr_hex_val
+//        // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT &&
+//        hex,DECODE [c+7:4]=next_hex_val
+//        // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT &&
+//        hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val
+//        // if prev_hex_val in high, curr_hex_val in low, skip.
+//        // if curr_hex_val in high, next_hex_val in low, write [u16]
+// if curr_hex_val not in high, write [u8]
+// Before writing, find the number of output characters per thread,
+// then do an intra-warp/intra-block scan to get each thread's out_idx, and
+// carry the final offset of this iteration forward into the next iteration.
+// Uses 1 warp per string or 1 block per string
+
+/**
+ * @brief Warp/Block parallel version of string_parse functor
+ *
+ * @tparam is_warp True if 1 warp per string, False if 1 block per string
+ * @tparam num_warps Number of warps per block
+ * @tparam str_tuple_it Iterator type for tuple with string pointer and its length
+ * @param str_tuples iterator of tuple with string pointer and its length
+ * @param total_out_strings Number of string rows to be processed
+ * @param str_counter Counter from which each warp/block atomically fetches its next string index
+ * @param null_mask Null mask; the bit of a row is cleared if it is a null literal or fails to parse
+ * @param null_count_data Counter that is incremented for every row whose null mask bit is cleared
+ * @param options Settings for controlling string processing behavior
+ * @param d_offsets If d_chars is nullptr, receives each string's output size; else output offsets
+ * @param d_chars Output character array, or nullptr to only compute the sizes into d_offsets
+ */
+template <bool is_warp, size_type num_warps, typename str_tuple_it>
+__global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
+                                         size_type total_out_strings,
+                                         size_type* str_counter,
+                                         bitmask_type* null_mask,
+                                         size_type* null_count_data,
+                                         cudf::io::parse_options_view const options,
+                                         size_type* d_offsets,
+                                         char* d_chars)
+{
+  constexpr auto BLOCK_SIZE =
+    is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps;
+  size_type lane = is_warp ? (threadIdx.x % BLOCK_SIZE) : threadIdx.x;
+
+  // get 1-string index per warp/block
+  auto get_next_string = [&]() {
+    if constexpr (is_warp) {
+      size_type istring;
+      if (lane == 0) { istring = atomicAdd(str_counter, 1); }
+      return __shfl_sync(0xffffffff, istring, 0);
+    } else {
+      // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's
+      // istring value
+      __syncthreads();
+      __shared__ size_type istring;
+      if (lane == 0) { istring = atomicAdd(str_counter, 1); }
+      __syncthreads();
+      return istring;
+    }
+  };
+  // Loop over strings; the next string index is fetched from the shared counter.
+  for (size_type istring = get_next_string(); istring < total_out_strings;
+       istring           = get_next_string()) {
+    // skip nulls
+    if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
+      if (!d_chars && lane == 0) d_offsets[istring] = 0;
+      continue;  // done with this string; fetch the next one
+    }
+
+    auto in_begin           = str_tuples[istring].first;
+    auto in_end             = in_begin + str_tuples[istring].second;
+    auto const num_in_chars = str_tuples[istring].second;
+    if constexpr (is_warp) {
+      if (num_in_chars <= SINGLE_THREAD_THRESHOLD or num_in_chars > WARP_THRESHOLD) continue;
+    } else {
+      if (num_in_chars <= WARP_THRESHOLD) continue;
+    }
+
+    // Check if the value corresponds to the null literal
+    if (!d_chars) {
+      auto const is_null_literal = serialized_trie_contains(
+        options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
+      if (is_null_literal && null_mask != nullptr) {
+        if (lane == 0) {
+          clear_bit(null_mask, istring);
+          atomicAdd(null_count_data, 1);
+          if (!d_chars) d_offsets[istring] = 0;
+        }
+        continue;  // done with this string; fetch the next one
+      }
+    }
+    // String values are indicated by keeping the quote character
+    bool const is_string_value =
+      num_in_chars >= 2LL &&
+      (options.quotechar == '\0' ||
+       (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar));
+    char* d_buffer = d_chars ? d_chars + d_offsets[istring] : nullptr;
+
+    // Copy literal/numeric value
+    if (not is_string_value) {
+      if (!d_chars) {
+        if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
+      } else {
+        for (thread_index_type char_index = lane; char_index < (in_end - in_begin);
+             char_index += BLOCK_SIZE) {
+          d_buffer[char_index] = in_begin[char_index];
+        }
+      }
+      continue;  // done with this string; fetch the next one
+    }
+
+    // Exclude beginning and ending quote chars from string range
+    if (!options.keepquotes) {
+      ++in_begin;
+      --in_end;
+    }
+    // warp-parallelized or block-parallelized process_string()
+
+    auto is_hex = [](auto ch) {
+      return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
+    };
+
+    // for backslash scan calculation: is_previous_escaping_backslash
+    [[maybe_unused]] auto warp_id = threadIdx.x / cudf::detail::warp_size;
+    bool init_state_reg;
+    __shared__ bool init_state_shared;
+    size_type last_offset_reg;
+    __shared__ size_type last_offset_shared;
+    bool& init_state(is_warp ? init_state_reg : init_state_shared);
+    size_type& last_offset(is_warp ? last_offset_reg : last_offset_shared);
+    if (is_warp || lane == 0) {
+      init_state  = false;
+      last_offset = 0;
+    }
+    using bitfield =
+      std::conditional_t<is_warp, bitfield_warp<num_warps>, bitfield_block<num_warps>>;
+    __shared__ bitfield is_slash;
+    is_slash.reset(warp_id);
+    __syncthreads();
+    // 0-31, 32-63, ... i*32-n.
+    // entire warp executes but with mask.
+    for (thread_index_type char_index = lane;
+         char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast<long>(BLOCK_SIZE));
+         char_index += BLOCK_SIZE) {
+      bool const is_within_bounds = char_index < (in_end - in_begin);
+      // All lanes execute every iteration (trip count is rounded up to BLOCK_SIZE) -> full mask
+      [[maybe_unused]] constexpr unsigned FULL_MASK = 0xffffffffu;
+      auto const c            = is_within_bounds ? in_begin[char_index] : '\0';
+      auto const escaped_char = get_escape_char(c);
+
+      bool is_escaping_backslash{false};
+      [[maybe_unused]] bool is_prev_escaping_backslash{false};
+      // To check current is backslash by checking if previous is backslash.
+      // curr = !prev & c=='\\'
+      // So, scan is required from beginning of string.
+      // State table approach (intra-warp FST) (intra-block FST)
+      // 2 states: Not-Slash(NS), Slash(S).
+      // prev  /   *
+      // NS    S  NS
+      //  S   NS  NS
+      // After inclusive scan, all current S states translate to escaping backslash.
+      // All escaping backslash should be skipped.
+
+      struct state_table {
+        // using bit fields instead of state[2]
+        bool state0 : 1;
+        bool state1 : 1;
+        bool inline __device__ get(bool init_state) const { return init_state ? state1 : state0; }
+      };
+      state_table curr{is_within_bounds && c == '\\', false};  // state transition vector.
+      auto composite_op = [](state_table op1, state_table op2) {
+        // equivalent of state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
+        return state_table{op1.state0 ? op2.state1 : op2.state0,
+                           op1.state1 ? op2.state1 : op2.state0};
+      };
+      state_table scanned;
+      // inclusive scan of escaping backslashes
+      if constexpr (is_warp) {
+        using SlashScan = cub::WarpScan<state_table>;
+        __shared__ typename SlashScan::TempStorage temp_slash[num_warps];
+        SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
+        is_escaping_backslash = scanned.get(init_state);
+        init_state            = __shfl_sync(FULL_MASK, is_escaping_backslash, BLOCK_SIZE - 1);
+        __syncwarp();
+        is_slash.shift(warp_id);
+        is_slash.set_bits(warp_id, is_escaping_backslash);
+        is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1);
+      } else {
+        using SlashScan = cub::BlockScan<state_table, BLOCK_SIZE>;
+        __shared__ typename SlashScan::TempStorage temp_slash;
+        SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
+        is_escaping_backslash = scanned.get(init_state);
+        __syncthreads();
+        if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash;
+        __syncthreads();
+        is_slash.shift(warp_id);
+        is_slash.set_bits(warp_id, is_escaping_backslash);
+        is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1);
+        // There is another __syncthreads() at the end of for-loop.
+      }
+
+      // String with parsing errors are made as null
+      bool error = false;
+      if (is_within_bounds) {
+        // curr=='\' and end, or prev=='\' and curr=='u' and end-curr < UNICODE_HEX_DIGIT_COUNT
+        // or any of the hex digit chars is non-hex (|| keeps these reads in front of in_end)
+        error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1);
+        error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR);
+        error |= (is_prev_escaping_backslash && c == 'u' &&
+                  ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) ||
+                   !is_hex(in_begin[char_index + 1]) || !is_hex(in_begin[char_index + 2]) ||
+                   !is_hex(in_begin[char_index + 3]) || !is_hex(in_begin[char_index + 4])));
+      }
+      // Make sure all threads have no errors before continuing
+      if constexpr (is_warp) {
+        error = __any_sync(FULL_MASK, error);
+      } else {
+        // Block-wide logical OR of the per-thread error flags, returned to every thread.
+        // A cub::BlockReduce result is only defined in thread 0, so letting all threads store
+        // their return value to one shared variable would be a race; __syncthreads_or() is a
+        // barrier that hands the same result to the whole block without any staging.
+        // All threads of the block get here: the trip count is the same for the whole block
+        // and the only exit from the char loop is the block-uniform error break below.
+        error = __syncthreads_or(error) != 0;
+      }
+      // If any thread has an error, skip the rest of the string and make this string as null
+      if (error) {
+        if (!d_chars && lane == 0) {
+          if (null_mask != nullptr) {
+            clear_bit(null_mask, istring);
+            atomicAdd(null_count_data, 1);
+          }
+          last_offset        = 0;
+          d_offsets[istring] = 0;
+        }
+        if constexpr (!is_warp) { __syncthreads(); }
+        break;  // abandon this string; the outer loop fetches the next one
+      }
+
+      // Skipping non-copied escaped characters
+      bool skip = !is_within_bounds;  // false;
+      // skip \ for \" \\ \/ \b \f \n \r \t \uXXXX
+      skip |= is_escaping_backslash;
+      if (is_within_bounds) {
+        // skip X for each X in \uXXXX
+        skip |=
+          char_index >= 2 && is_slash.get_bit(warp_id, lane - 2) && in_begin[char_index - 1] == 'u';
+        skip |=
+          char_index >= 3 && is_slash.get_bit(warp_id, lane - 3) && in_begin[char_index - 2] == 'u';
+        skip |=
+          char_index >= 4 && is_slash.get_bit(warp_id, lane - 4) && in_begin[char_index - 3] == 'u';
+        skip |=
+          char_index >= 5 && is_slash.get_bit(warp_id, lane - 5) && in_begin[char_index - 4] == 'u';
+      }
+      int this_num_out = 0;
+      cudf::char_utf8 write_char{};
+
+      if (!skip) {
+        // 1. Unescaped character
+        if (!is_prev_escaping_backslash) {
+          this_num_out = 1;
+          // writes char directly for non-unicode
+        } else {
+          // 2. Escaped character
+          if (escaped_char != UNICODE_SEQ) {
+            this_num_out = 1;
+            // writes char directly for non-unicode
+          } else {
+            // 3. Unicode
+            // UTF8 \uXXXX
+            auto hex_val     = parse_unicode_hex(in_begin + char_index + 1);
+            auto hex_low_val = 0;
+            // UTF16 \uXXXX\uXXXX
+            // Note: no need for scanned_backslash below because we already know that
+            // only '\u' check is enough.
+            if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
+                (in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) <
+                  in_end &&
+                in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS - 1] == '\\' &&
+                in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS] == 'u') {
+              hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6);
+            }
+            if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
+                hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) {
+              // Compute UTF16-encoded code point
+              uint32_t unicode_code_point = 0x10000 +
+                                            ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) +
+                                            (hex_low_val - UTF16_LOW_SURROGATE_BEGIN);
+              write_char   = strings::detail::codepoint_to_utf8(unicode_code_point);
+              this_num_out = strings::detail::bytes_in_char_utf8(write_char);
+            } else {
+              // if hex_val is high surrogate, ideally it should be parsing failure.
+              // but skipping it as other parsers do this too.
+              if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) {
+                // Ideally this should be skipped if previous char is high surrogate.
+                skip         = true;
+                this_num_out = 0;
+                write_char   = 0;
+              } else {
+                // if UTF8
+                write_char   = strings::detail::codepoint_to_utf8(hex_val);
+                this_num_out = strings::detail::bytes_in_char_utf8(write_char);
+              }
+            }
+          }
+        }
+      }  // !skip end.
+      {
+        // compute offset to write output for each thread
+        size_type offset;
+        if constexpr (is_warp) {
+          using OffsetScan = cub::WarpScan<size_type>;
+          __shared__ typename OffsetScan::TempStorage temp_storage[num_warps];
+          OffsetScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset);
+        } else {
+          using OffsetScan = cub::BlockScan<size_type, BLOCK_SIZE>;
+          __shared__ typename OffsetScan::TempStorage temp_storage;
+          OffsetScan(temp_storage).ExclusiveSum(this_num_out, offset);
+          __syncthreads();
+        }
+        offset += last_offset;
+        // Write output
+        if (d_chars && !skip) {
+          auto const is_not_unicode = (!is_prev_escaping_backslash) || escaped_char != UNICODE_SEQ;
+          if (is_not_unicode) {
+            *(d_buffer + offset) = (!is_prev_escaping_backslash) ? c : escaped_char;
+          } else {
+            strings::detail::from_char_utf8(write_char, d_buffer + offset);
+          }
+        }
+        offset += this_num_out;
+        if constexpr (is_warp) {
+          last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1);
+        } else {
+          __syncthreads();
+          if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset;
+          __syncthreads();
+        }
+      }
+    }  // char for-loop
+    if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; }
+  }  // loop over strings
+}
+
+template <typename str_tuple_it>
+struct string_parse {  // per-row functor; rows longer than SINGLE_THREAD_THRESHOLD are skipped
+  str_tuple_it str_tuples;  // indexable; each element has .first (begin) and .second (length)
+  bitmask_type* null_mask;  // may be nullptr; bit idx is cleared when row idx becomes null
+  size_type* null_count_data;  // atomically incremented for each row this functor marks null
+  cudf::io::parse_options_view const options;
+  size_type* d_offsets{};  // written with per-row sizes while d_chars is null, read otherwise
+  char* d_chars{};  // null: only compute sizes; non-null: destination character buffer
+
+  __device__ void operator()(size_type idx)
+  {
+    if (null_mask != nullptr && not bit_is_set(null_mask, idx)) {  // row is already null
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    auto const in_begin     = str_tuples[idx].first;
+    auto const in_end       = in_begin + str_tuples[idx].second;
+    auto const num_in_chars = str_tuples[idx].second;
+
+    if (num_in_chars > SINGLE_THREAD_THRESHOLD) return;  // longer rows are not handled here
+
+    // Null-literal check; it runs only while d_chars is null and needs a null mask to take effect
+    if (!d_chars) {
+      auto const is_null_literal = serialized_trie_contains(
+        options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
+      if (is_null_literal && null_mask != nullptr) {
+        clear_bit(null_mask, idx);
+        atomicAdd(null_count_data, 1);
+        if (!d_chars) d_offsets[idx] = 0;  // always true here: already inside `if (!d_chars)`
+        return;
+      }
+    }
+
+    char* d_buffer        = d_chars ? d_chars + d_offsets[idx] : nullptr;
+    auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
+    if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
+      if (null_mask != nullptr) {  // parse failure: mark the row null
+        clear_bit(null_mask, idx);
+        atomicAdd(null_count_data, 1);
+      }
+      if (!d_chars) d_offsets[idx] = 0;
+    } else {
+      if (!d_chars) d_offsets[idx] = str_process_info.bytes;  // size reported by process_string
+    }
+  }
+};
+
+template <typename SymbolT>
+struct to_string_view_pair {  // maps an (offset, length) tuple to a (pointer, length) pair
+  SymbolT const* data;  // base pointer that the tuple's first element is added to
+  to_string_view_pair(SymbolT const* _data) : data(_data) {}
+  __device__ auto operator()(thrust::tuple<size_type, size_type> ip)
+  {
+    return thrust::pair<char const*, std::size_t>{data + thrust::get<0>(ip),
+                                                  static_cast<std::size_t>(thrust::get<1>(ip))};
+  }
+};
+
+template <typename string_view_pair_it>
+static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
+                                            size_type col_size,
+                                            rmm::device_buffer&& null_mask,
+                                            rmm::device_scalar<size_type>& d_null_count,
+                                            cudf::io::parse_options_view const& options,
+                                            rmm::cuda_stream_view stream,
+                                            rmm::mr::device_memory_resource* mr)
+{
+  // CUDF_FUNC_RANGE() is left disabled. Pass 1 sizes each row, pass 2 writes the characters.
+
+  auto const max_length = thrust::transform_reduce(  // largest t.second over all rows
+    rmm::exec_policy(stream),
+    str_tuples,
+    str_tuples + col_size,
+    [] __device__(auto t) { return t.second; },
+    size_type{0},
+    thrust::maximum<size_type>{});
+
+  auto offsets = cudf::make_numeric_column(
+    data_type{type_to_id<size_type>()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
+  auto d_offsets       = offsets->mutable_view().data<size_type>();
+  auto null_count_data = d_null_count.data();
+
+  auto single_thread_fn = string_parse<decltype(str_tuples)>{  // d_chars stays null: sizing pass
+    str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options, d_offsets};
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator<size_type>(0),
+                     col_size,
+                     single_thread_fn);
+
+  constexpr auto warps_per_block  = 8;
+  constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block;
+  auto num_blocks                 = cudf::util::div_rounding_up_safe(col_size, warps_per_block);
+  auto str_counter                = cudf::numeric_scalar(size_type{0}, true, stream);
+
+  // TODO run these independent kernels in parallel streams.
+  if (max_length > SINGLE_THREAD_THRESHOLD) {  // launched only if some row exceeds the threshold
+    parse_fn_string_parallel<true, warps_per_block>
+      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data,
+        options,
+        d_offsets,
+        nullptr);  // no character buffer yet: sizing pass
+  }
+
+  if (max_length > WARP_THRESHOLD) {
+    // for strings longer than WARP_THRESHOLD, 1 block per string
+    str_counter.set_value(0, stream);
+    parse_fn_string_parallel<false, warps_per_block>
+      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data,
+        options,
+        d_offsets,
+        nullptr);  // no character buffer yet: sizing pass
+  }
+  auto const bytes =  // total output size; the result is written back into d_offsets
+    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
+  CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
+               "Size of output exceeds the column size limit",
+               std::overflow_error);
+
+  // CHARS column
+  std::unique_ptr<column> chars =
+    strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
+  auto d_chars = chars->mutable_view().data<char>();
+
+  single_thread_fn.d_chars = d_chars;  // switch the functor to the write pass
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator<size_type>(0),
+                     col_size,
+                     single_thread_fn);
+
+  if (max_length > SINGLE_THREAD_THRESHOLD) {
+    str_counter.set_value(0, stream);
+    parse_fn_string_parallel<true, warps_per_block>
+      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data,
+        options,
+        d_offsets,
+        d_chars);  // write pass
+  }
+
+  if (max_length > WARP_THRESHOLD) {
+    str_counter.set_value(0, stream);
+    // for strings longer than WARP_THRESHOLD, 1 block per string
+    parse_fn_string_parallel<false, warps_per_block>
+      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data,
+        options,
+        d_offsets,
+        d_chars);  // write pass
+  }
+
+  return make_strings_column(col_size,
+                             std::move(offsets),
+                             std::move(chars),
+                             d_null_count.value(stream),
+                             std::move(null_mask));
+}
+
+std::unique_ptr<column> parse_data(
+  const char* data,
+  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  size_type col_size,
+  data_type col_type,
+  rmm::device_buffer&& null_mask,
+  size_type null_count,
+  cudf::io::parse_options_view const& options,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+
+  if (col_size == 0) { return make_empty_column(col_type); }
+  auto d_null_count    = rmm::device_scalar<size_type>(null_count, stream);  // running null count
+  auto null_count_data = d_null_count.data();
+
+  // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
+  auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data});
+
+  if (col_type == cudf::data_type{cudf::type_id::STRING}) {  // strings take a separate path
+    return parse_string(str_tuples,
+                        col_size,
+                        std::forward<rmm::device_buffer>(null_mask),  // same effect as std::move
+                        d_null_count,
+                        options,
+                        stream,
+                        mr);
+  }
+
+  auto out_col =
+    make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr);
+  auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream);
+
+  // use `ConvertFunctor` to convert non-string values
+  thrust::for_each_n(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    col_size,
+    [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__(
+      size_type row) {
+      if (col.is_null(row)) { return; }  // null on input: nothing to parse
+      auto const in = str_tuples[row];
+
+      auto const is_null_literal =
+        serialized_trie_contains(options.trie_na, {in.first, static_cast<size_t>(in.second)});
+
+      if (is_null_literal) {
+        col.set_null(row);
+        atomicAdd(null_count_data, 1);
+        return;
+      }
+
+      // If this is a string value, remove quotes
+      auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar);
+
+      auto const is_parsed = cudf::type_dispatcher(col_type,
+                                                   ConvertFunctor{},
+                                                   in_begin,
+                                                   in_end,
+                                                   col.data<char>(),
+                                                   row,
+                                                   col_type,
+                                                   options,
+                                                   false);
+      if (not is_parsed) {  // conversion failure turns the row into a null
+        col.set_null(row);
+        atomicAdd(null_count_data, 1);
+      }
+    });
+
+  out_col->set_null_count(d_null_count.value(stream));  // includes rows nulled in the loop above
+
+  return out_col;
+}
+
+}  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 5c3af588411..43d62fcd513 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -116,6 +116,28 @@ struct parse_options {
   }
 };
 
+/**
+ * @brief Returns the two-character escape sequence for a character, as a (first, second) pair.
+ *
+ * @param escaped_char The unescaped character to look up.
+ * @return (backslash, letter) for the cases handled below; otherwise (NUL, escaped_char).
+ */
+__device__ __forceinline__ thrust::pair<char, char> get_escaped_char(char escaped_char)
+{
+  switch (escaped_char) {
+    case '"': return {'\\', '"'};
+    case '\\': return {'\\', '\\'};
+    case '/': return {'\\', '/'};
+    case '\b': return {'\\', 'b'};
+    case '\f': return {'\\', 'f'};
+    case '\n': return {'\\', 'n'};
+    case '\r': return {'\\', 'r'};
+    case '\t': return {'\\', 't'};
+    // case 'u': return UNICODE_SEQ;
+    default: return {'\0', escaped_char};
+  }
+}
+
 /**
  * @brief Returns the numeric value of an ASCII/UTF-8 character.
  * Handles hexadecimal digits, both uppercase and lowercase
diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp
new file mode 100644
index 00000000000..12fc0a5b2e7
--- /dev/null
+++ b/cpp/src/io/utilities/string_parsing.hpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <io/utilities/parsing_utils.cuh>
+
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+
+namespace cudf::io {
+namespace detail {
+
+/**
+ * @brief Infers data type for a given JSON string input `data`.
+ *
+ * @throw cudf::logic_error if input size is 0
+ * @throw cudf::logic_error if date time is not inferred as string
+ * @throw cudf::logic_error if data type inference failed
+ *
+ * @param options View of inference options
+ * @param data JSON string input
+ * @param offset_length_begin The beginning of an offset-length tuple sequence
+ * @param size Size of the string input
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return The inferred data type
+ */
+cudf::data_type infer_data_type(
+  cudf::io::json_inference_options_view const& options,
+  device_span<char const> data,
+  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  std::size_t const size,
+  rmm::cuda_stream_view stream);
+}  // namespace detail
+
+namespace json::detail {
+
+/**
+ * @brief Parses strings given as (offset, length) pairs relative to `data` into a typed column
+ *
+ * @param data string input base pointer
+ * @param offset_length_begin The beginning of an offset-length tuple sequence
+ * @param col_size The total number of items of this column
+ * @param col_type The column's target data type
+ * @param null_mask,null_count Validity mask of the input items and the null count to start from
+ * @param options Settings for controlling the processing behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr The resource to be used for device memory allocation
+ * @return The column that contains the parsed data
+ */
+std::unique_ptr<column> parse_data(
+  const char* data,
+  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  size_type col_size,
+  data_type col_type,
+  rmm::device_buffer&& null_mask,
+  size_type null_count,
+  cudf::io::parse_options_view const& options,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
+}  // namespace json::detail
+}  // namespace cudf::io
diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cu
similarity index 84%
rename from cpp/src/io/utilities/type_inference.cuh
rename to cpp/src/io/utilities/type_inference.cu
index a9ccc80ca33..79a5c8f1c4c 100644
--- a/cpp/src/io/utilities/type_inference.cuh
+++ b/cpp/src/io/utilities/type_inference.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,23 +13,16 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#pragma once
 
 #include <io/utilities/column_type_histogram.hpp>
-#include <io/utilities/parsing_utils.cuh>
+#include <io/utilities/string_parsing.hpp>
 #include <io/utilities/trie.cuh>
 
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/span.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 
-#include <thrust/distance.h>
-#include <thrust/tuple.h>
-
 #include <cub/block/block_reduce.cuh>
 
 #include <cstddef>
@@ -114,14 +107,14 @@ __device__ __inline__ bool is_like_float(std::size_t len,
  *
  * @param[in] options View of inference options
  * @param[in] data JSON string input
- * @param[in] column_strings_begin The beginning of an offset-length tuple sequence
+ * @param[in] offset_length_begin The beginning of an offset-length tuple sequence
  * @param[in] size Size of the string input
  * @param[out] column_info Histogram of column type counters
  */
 template <int BlockSize, typename OptionsView, typename ColumnStringIter>
 __global__ void infer_column_type_kernel(OptionsView options,
                                          device_span<char const> data,
-                                         ColumnStringIter column_strings_begin,
+                                         ColumnStringIter offset_length_begin,
                                          std::size_t size,
                                          cudf::io::column_type_histogram* column_info)
 {
@@ -129,8 +122,8 @@ __global__ void infer_column_type_kernel(OptionsView options,
 
   for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size;
        idx += gridDim.x * blockDim.x) {
-    auto const field_offset = thrust::get<0>(*(column_strings_begin + idx));
-    auto const field_len    = thrust::get<1>(*(column_strings_begin + idx));
+    auto const field_offset = thrust::get<0>(*(offset_length_begin + idx));
+    auto const field_len    = thrust::get<1>(*(offset_length_begin + idx));
     auto const field_begin  = data.begin() + field_offset;
 
     if (cudf::detail::serialized_trie_contains(
@@ -234,7 +227,7 @@ __global__ void infer_column_type_kernel(OptionsView options,
  *
  * @param options View of inference options
  * @param data JSON string input
- * @param column_strings_begin The beginning of an offset-length tuple sequence
+ * @param offset_length_begin The beginning of an offset-length tuple sequence
  * @param size Size of the string input
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @return A histogram containing column-specific type counters
@@ -242,7 +235,7 @@ __global__ void infer_column_type_kernel(OptionsView options,
 template <typename OptionsView, typename ColumnStringIter>
 cudf::io::column_type_histogram infer_column_type(OptionsView const& options,
                                                   cudf::device_span<char const> data,
-                                                  ColumnStringIter column_strings_begin,
+                                                  ColumnStringIter offset_length_begin,
                                                   std::size_t const size,
                                                   rmm::cuda_stream_view stream)
 {
@@ -254,40 +247,22 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options,
     d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value()));
 
   infer_column_type_kernel<block_size><<<grid_size, block_size, 0, stream.value()>>>(
-    options, data, column_strings_begin, size, d_column_info.data());
+    options, data, offset_length_begin, size, d_column_info.data());
 
   return d_column_info.value(stream);
 }
 
-/**
- * @brief Infers data type for a given JSON string input `data`.
- *
- * @throw cudf::logic_error if input size is 0
- * @throw cudf::logic_error if date time is not inferred as string
- * @throw cudf::logic_error if data type inference failed
- *
- * @tparam OptionsView Type of inference options view
- * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to
- * `thrust::tuple<device_span, string_view>`
- *
- * @param options View of inference options
- * @param data JSON string input
- * @param column_strings_begin The beginning of an offset-length tuple sequence
- * @param size Size of the string input
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @return The inferred data type
- */
-template <typename OptionsView, typename ColumnStringIter>
-cudf::data_type infer_data_type(OptionsView const& options,
-                                device_span<char const> data,
-                                ColumnStringIter column_strings_begin,
-                                std::size_t const size,
-                                rmm::cuda_stream_view stream)
+cudf::data_type infer_data_type(
+  cudf::io::json_inference_options_view const& options,
+  device_span<char const> data,
+  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  std::size_t const size,
+  rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(size != 0, "No data available for data type inference.\n");
 
-  auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream);
+  auto const h_column_info = infer_column_type(options, data, offset_length_begin, size, stream);
 
   auto get_type_id = [&](auto const& cinfo) {
     auto int_count_total =
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 220f1a3391f..7c911ac2e04 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -26,6 +26,7 @@
 #include <cudf/io/arrow_io_source.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/strings/convert/convert_fixed_point.hpp>
+#include <cudf/strings/repeat_strings.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
@@ -1370,6 +1371,124 @@ TEST_F(JsonReaderTest, JsonExperimentalLines)
   CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view());
 }
 
+TEST_F(JsonReaderTest, JsonLongString)
+{
+  // Unicode
+  // 0000-FFFF     Basic Multilingual Plane
+  // 10000-10FFFF  Supplementary Plane
+  cudf::test::strings_column_wrapper col1{
+    {
+      "\"\\/\b\f\n\r\t",
+      "\"",
+      "\\",
+      "/",
+      "\b",
+      "\f\n",
+      "\r\t",
+      "$€",
+      "ராபிட்ஸ்",
+      "C𝞵𝓓𝒻",
+      "",  // null
+      "",  // null
+      "கார்த்தி",
+      "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ",  //  0000-FFFF
+      "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰",                            // 10000-1FFFF
+      "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮",                // 20000-2FFFF
+      "𰾑𱔈𲍉",                                          // 30000-3FFFF
+      R"("$€ \u0024\u20ac \\u0024\\u20ac  \\\u0024\\\u20ac \\\\u0024\\\\u20ac)",
+      R"(        \\\\\\\\\\\\\\\\)",
+      R"(\\\\\\\\\\\\\\\\)",
+      R"(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)",
+      R"( \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)",
+      R"(                      \\abcd)",
+      R"(                 \\\\\\\\\\\\\\\\                 \\\\\\\\\\\\\\\\)",
+      R"(                \\\\\\\\\\\\\\\\                 \\\\\\\\\\\\\\\\)",
+    },
+    cudf::test::iterators::nulls_at({10, 11})};
+
+  cudf::test::fixed_width_column_wrapper<int16_t> repeat_times{
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3},
+    cudf::test::iterators::no_nulls()};
+  auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times);
+  auto col2   = d_col2->view();
+  cudf::table_view const tbl_view{{col1, col2, repeat_times}};
+  cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}};
+
+  std::vector<char> out_buffer;
+  auto destination     = cudf::io::sink_info(&out_buffer);
+  auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view)
+                           .include_nulls(true)
+                           .metadata(mt)
+                           .lines(true)
+                           .na_rep("null");
+
+  cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource());
+
+  cudf::table_view const expected = tbl_view;
+  std::map<std::string, data_type> types;
+  types["col1"]  = data_type{type_id::STRING};
+  types["col2"]  = data_type{type_id::STRING};
+  types["int16"] = data_type{type_id::INT16};
+
+  // Initialize parsing options (reading json lines)
+  cudf::io::json_reader_options json_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{out_buffer.data(), out_buffer.size()})
+      .lines(true)
+      .dtypes(types);
+
+  // Read test data via nested JSON reader
+  auto const table = cudf::io::read_json(json_lines_options);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view());
+}
+
+TEST_F(JsonReaderTest, ErrorStrings)
+{
+  // cases of invalid escape characters, invalid unicode encodings.
+  // Error strings will decode to nulls
+  auto const buffer = std::string{R"(
+    {"col0": "\"\a"}
+    {"col0": "\u"}
+    {"col0": "\u0"}
+    {"col0": "\u0b"}
+    {"col0": "\u00b"}
+    {"col0": "\u00bz"}
+    {"col0": "\t34567890123456\t9012345678901\ug0bc"}
+    {"col0": "\t34567890123456\t90123456789012\u0hbc"}
+    {"col0": "\t34567890123456\t90123456789012\u00ic"}
+    {"col0": "\u0b95\u0bbe\u0bb0\u0bcd\u0ba4\u0bcd\u0ba4\u0bbfகார்த்தி"}
+)"};
+  // The last row is not an error case; it shows that unicode in JSON is copied to the output.
+
+  cudf::io::json_reader_options const in_opts =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
+      .dtypes({data_type{cudf::type_id::STRING}})
+      .lines(true)
+      .legacy(false);
+
+  auto const result      = cudf::io::read_json(in_opts);
+  auto const result_view = result.tbl->view().column(0);
+
+  EXPECT_EQ(result.metadata.schema_info[0].name, "col0");
+  EXPECT_EQ(result_view.null_count(), 9);  // every row except the last one
+  cudf::test::strings_column_wrapper expected{
+    {"",
+     "",
+     "",
+     "",
+     "",
+     "",
+     "",
+     "",
+     "",
+     "கார்த்தி\xe0\xae\x95\xe0\xae\xbe\xe0\xae\xb0\xe0\xaf\x8d\xe0\xae\xa4\xe0\xaf\x8d\xe0\xae\xa4"
+     "\xe0\xae\xbf"},
+    // unicode hex 0xe0 0xae 0x95 0xe0 0xae 0xbe 0xe0 0xae 0xb0 0xe0 0xaf 0x8d
+    //             0xe0 0xae 0xa4 0xe0 0xaf 0x8d 0xe0 0xae 0xa4 0xe0 0xae 0xbf
+    cudf::test::iterators::nulls_at({0, 1, 2, 3, 4, 5, 6, 7, 8})};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected);
+}
+
 TEST_F(JsonReaderTest, TokenAllocation)
 {
   std::array<std::string const, 3> const json_inputs{
diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu
index 5c32131114d..9eb5e8f5230 100644
--- a/cpp/tests/io/json_type_cast_test.cu
+++ b/cpp/tests/io/json_type_cast_test.cu
@@ -21,15 +21,20 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
+#include <io/utilities/string_parsing.hpp>
+
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/datasource.hpp>
-#include <cudf/io/detail/data_casting.cuh>
 #include <cudf/io/json.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
+#include <rmm/exec_policy.hpp>
+
+#include <algorithm>
+#include <iterator>
 #include <type_traits>
 
 using namespace cudf::test::iterators;
@@ -37,13 +42,27 @@ using namespace cudf::test::iterators;
 struct JSONTypeCastTest : public cudf::test::BaseFixture {};
 
 namespace {
-struct to_thrust_pair_fn {
-  __device__ thrust::pair<char const*, cudf::size_type> operator()(
-    thrust::pair<cudf::string_view, bool> const& p)
+struct offsets_to_length {
+  __device__ cudf::size_type operator()(thrust::tuple<cudf::size_type, cudf::size_type> const& p)
   {
-    return {p.first.data(), p.first.size_bytes()};
+    return thrust::get<1>(p) - thrust::get<0>(p);
   }
 };
+
+/// Returns the length (end offset minus start offset) of each string in `column`, on `stream`
+auto string_offset_to_length(cudf::strings_column_view const& column, rmm::cuda_stream_view stream)
+{
+  auto offsets_begin = column.offsets_begin();
+  auto offsets_pair =  // (offsets[i], offsets[i + 1]) for row i
+    thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin)));
+  rmm::device_uvector<cudf::size_type> svs_length(column.size(), stream);
+  thrust::transform(rmm::exec_policy(stream),  // run on the stream the output was allocated on
+                    offsets_pair,
+                    offsets_pair + column.size(),
+                    svs_length.begin(),
+                    offsets_to_length{});
+  return svs_length;
+}
 }  // namespace
 
 auto default_json_options()
@@ -67,26 +86,23 @@ TEST_F(JSONTypeCastTest, String)
   std::vector<char const*> input_values{"this", "is", "null", "of", "", "strings", R"("null")"};
   cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids);
 
-  auto d_column = cudf::column_device_view::create(input);
-  rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> svs(d_column->size(), stream);
-  thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
-                    d_column->pair_begin<cudf::string_view, false>(),
-                    d_column->pair_end<cudf::string_view, false>(),
-                    svs.begin(),
-                    to_thrust_pair_fn{});
+  auto column                                     = cudf::strings_column_view(input);
+  rmm::device_uvector<cudf::size_type> svs_length = string_offset_to_length(column, stream);
 
   auto null_mask_it = no_nulls();
   auto null_mask =
-    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));
-
-  auto str_col = cudf::io::json::detail::parse_data(svs.data(),
-                                                    svs.size(),
-                                                    type,
-                                                    std::move(null_mask),
-                                                    0,
-                                                    default_json_options().view(),
-                                                    stream,
-                                                    mr);
+    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size()));
+
+  auto str_col = cudf::io::json::detail::parse_data(
+    column.chars().data<char>(),
+    thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
+    column.size(),
+    type,
+    std::move(null_mask),
+    0,
+    default_json_options().view(),
+    stream,
+    mr);
 
   auto out_valids =
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; });
@@ -103,26 +119,23 @@ TEST_F(JSONTypeCastTest, Int)
   auto const type   = cudf::data_type{cudf::type_id::INT64};
 
   cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"});
-  auto d_column = cudf::column_device_view::create(data);
-  rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> svs(d_column->size(), stream);
-  thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
-                    d_column->pair_begin<cudf::string_view, false>(),
-                    d_column->pair_end<cudf::string_view, false>(),
-                    svs.begin(),
-                    to_thrust_pair_fn{});
+  auto column                                     = cudf::strings_column_view(data);
+  rmm::device_uvector<cudf::size_type> svs_length = string_offset_to_length(column, stream);
 
   auto null_mask_it = no_nulls();
   auto null_mask =
-    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));
-
-  auto col = cudf::io::json::detail::parse_data(svs.data(),
-                                                svs.size(),
-                                                type,
-                                                std::move(null_mask),
-                                                0,
-                                                default_json_options().view(),
-                                                stream,
-                                                mr);
+    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size()));
+
+  auto col = cudf::io::json::detail::parse_data(
+    column.chars().data<char>(),
+    thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
+    column.size(),
+    type,
+    std::move(null_mask),
+    0,
+    default_json_options().view(),
+    stream,
+    mr);
 
   auto expected =
     cudf::test::fixed_width_column_wrapper<int64_t>{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}};
@@ -146,26 +159,23 @@ TEST_F(JSONTypeCastTest, StringEscapes)
     R"("escape with nothing to escape \")",
     R"("\"\\\/\b\f\n\r\t")",
   });
-  auto d_column = cudf::column_device_view::create(data);
-  rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> svs(d_column->size(), stream);
-  thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
-                    d_column->pair_begin<cudf::string_view, false>(),
-                    d_column->pair_end<cudf::string_view, false>(),
-                    svs.begin(),
-                    to_thrust_pair_fn{});
+  auto column                                     = cudf::strings_column_view(data);
+  rmm::device_uvector<cudf::size_type> svs_length = string_offset_to_length(column, stream);
 
   auto null_mask_it = no_nulls();
   auto null_mask =
-    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));
-
-  auto col = cudf::io::json::detail::parse_data(svs.data(),
-                                                svs.size(),
-                                                type,
-                                                std::move(null_mask),
-                                                0,
-                                                default_json_options().view(),
-                                                stream,
-                                                mr);
+    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size()));
+
+  auto col = cudf::io::json::detail::parse_data(
+    column.chars().data<char>(),
+    thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
+    column.size(),
+    type,
+    std::move(null_mask),
+    0,
+    default_json_options().view(),
+    stream,
+    mr);
 
   auto expected = cudf::test::strings_column_wrapper{
     {"🚀", "Ａ🚀ＡＡ", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"},
@@ -173,4 +183,71 @@ TEST_F(JSONTypeCastTest, StringEscapes)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected);
 }
 
+TEST_F(JSONTypeCastTest, ErrorNulls)
+{
+  auto const stream = cudf::get_default_stream();
+  auto mr           = rmm::mr::get_current_device_resource();
+  auto const type   = cudf::data_type{cudf::type_id::STRING};
+
+  // error in decoding
+  std::vector<char const*> input_values{R"("\"\a")",
+                                        R"("\u")",
+                                        R"("\u0")",
+                                        R"("\u0b")",
+                                        R"("\u00b")",
+                                        R"("\u00bz")",
+                                        R"("\t34567890123456\t9012345678901\ug0bc")",
+                                        R"("\t34567890123456\t90123456789012\u0hbc")",
+                                        R"("\t34567890123456\t90123456789012\u00ic")",
+                                        R"("\t34567890123456\t9012345678901\")",
+                                        R"("\t34567890123456\t90123456789012\")",
+                                        R"(null)"};
+  // Note: without quotes are copied without decoding
+  cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end());
+
+  auto column        = cudf::strings_column_view(input);
+  auto space_length  = 128;
+  auto prepend_space = [&space_length](auto const& s) {
+    if (s[0] == '"') return "\"" + std::string(space_length, ' ') + std::string(s + 1);
+    return std::string(s);
+  };
+  std::vector<std::string> small_input;
+  std::transform(
+    input_values.begin(), input_values.end(), std::back_inserter(small_input), prepend_space);
+  cudf::test::strings_column_wrapper small_col(small_input.begin(), small_input.end());
+
+  std::vector<std::string> large_input;
+  space_length = 128 * 128;
+  std::transform(
+    input_values.begin(), input_values.end(), std::back_inserter(large_input), prepend_space);
+  cudf::test::strings_column_wrapper large_col(large_input.begin(), large_input.end());
+
+  std::vector<char const*> expected_values{"", "", "", "", "", "", "", "", "", "", "", ""};
+  cudf::test::strings_column_wrapper expected(
+    expected_values.begin(), expected_values.end(), cudf::test::iterators::all_nulls());
+
+  // single threads, warp, block.
+  for (auto const& column :
+       {column, cudf::strings_column_view(small_col), cudf::strings_column_view(large_col)}) {
+    rmm::device_uvector<cudf::size_type> svs_length = string_offset_to_length(column, stream);
+
+    auto null_mask_it = no_nulls();
+    auto null_mask =
+      std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size()));
+
+    auto str_col = cudf::io::json::detail::parse_data(
+      column.chars().data<char>(),
+      thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
+      column.size(),
+      type,
+      std::move(null_mask),
+      0,
+      default_json_options().view(),
+      stream,
+      mr);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(str_col->view(), expected);
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu
index b2eb1b94f9c..a14e7ecf5b3 100644
--- a/cpp/tests/io/type_inference_test.cu
+++ b/cpp/tests/io/type_inference_test.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
+#include <io/utilities/string_parsing.hpp>
 #include <io/utilities/trie.cuh>
-#include <io/utilities/type_inference.cuh>
 
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
@@ -50,8 +50,8 @@ TEST_F(TypeInference, Basic)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 4, 7};
-  auto const string_length   = std::vector<std::size_t>{2, 2, 1};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 4, 7};
+  auto const string_length   = std::vector<cudf::size_type>{2, 2, 1};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -83,8 +83,8 @@ TEST_F(TypeInference, Null)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 1, 4};
-  auto const string_length   = std::vector<std::size_t>{0, 2, 1};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 1, 4};
+  auto const string_length   = std::vector<cudf::size_type>{0, 2, 1};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -116,8 +116,8 @@ TEST_F(TypeInference, AllNull)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 1, 1};
-  auto const string_length   = std::vector<std::size_t>{0, 0, 4};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 1, 1};
+  auto const string_length   = std::vector<cudf::size_type>{0, 0, 4};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -149,8 +149,8 @@ TEST_F(TypeInference, String)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 8, 12};
-  auto const string_length   = std::vector<std::size_t>{6, 3, 4};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 8, 12};
+  auto const string_length   = std::vector<cudf::size_type>{6, 3, 4};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -182,8 +182,8 @@ TEST_F(TypeInference, Bool)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 6, 12};
-  auto const string_length   = std::vector<std::size_t>{4, 5, 5};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 6, 12};
+  auto const string_length   = std::vector<cudf::size_type>{4, 5, 5};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -215,8 +215,8 @@ TEST_F(TypeInference, Timestamp)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 10};
-  auto const string_length   = std::vector<std::size_t>{8, 9};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 10};
+  auto const string_length   = std::vector<cudf::size_type>{8, 9};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -249,8 +249,8 @@ TEST_F(TypeInference, InvalidInput)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 3, 5, 7, 9};
-  auto const string_length   = std::vector<std::size_t>{1, 1, 1, 1, 1};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 3, 5, 7, 9};
+  auto const string_length   = std::vector<cudf::size_type>{1, 1, 1, 1, 1};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(

From 63d197fe029ff2b57f4e0c7ab975bb35f844fc25 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 19 Sep 2023 19:27:10 -0700
Subject: [PATCH 15/23] Avoid circular cimports in _lib/cpp/reduce.pxd (#14125)

This Cython module contains cimports from higher-level modules than it should, which introduces the possibility of circular import issues. It also contains an unused cimport of `DeviceScalar` that can cause similar issues.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14125
---
 python/cudf/cudf/_lib/cpp/reduce.pxd | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/reduce.pxd b/python/cudf/cudf/_lib/cpp/reduce.pxd
index 7952c717916..997782dec6c 100644
--- a/python/cudf/cudf/_lib/cpp/reduce.pxd
+++ b/python/cudf/cudf/_lib/cpp/reduce.pxd
@@ -1,14 +1,13 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport pair
 
-from cudf._lib.aggregation cimport reduce_aggregation, scan_aggregation
+from cudf._lib.cpp.aggregation cimport reduce_aggregation, scan_aggregation
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.types cimport data_type
-from cudf._lib.scalar cimport DeviceScalar
 
 
 cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil:

From 2d4f22a9ab0709f808af9253097037e0eb5d00b1 Mon Sep 17 00:00:00 2001
From: Sam Turner <98767222+stmio@users.noreply.github.com>
Date: Wed, 20 Sep 2023 13:57:26 +0100
Subject: [PATCH 16/23] Implement `GroupBy.value_counts` to match pandas API
 (#14114)

This PR implements `GroupBy.value_counts`, matching the [pandas equivalent](https://pandas.pydata.org/docs/dev/reference/api/pandas.core.groupby.DataFrameGroupBy.value_counts.html) method.

Tests currently ignore the name of the returned Series/DataFrame (by passing `check_names=False`), because the result name was [added to pandas in v2.0.0](https://github.com/pandas-dev/pandas/commit/bec92a43feb0057f06f4f9b9db26c1a09232b1c0). This workaround can be removed once the tests run against `pandas>=2.0.0`.

Closes #12789

Authors:
  - Sam Turner (https://github.com/stmio)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/14114
---
 python/cudf/cudf/core/groupby/groupby.py | 164 +++++++++++++++++++++++
 python/cudf/cudf/tests/test_groupby.py   |  67 +++++++++
 2 files changed, 231 insertions(+)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b300c55b537..e1740140b44 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -2336,6 +2336,170 @@ def pct_change(
         shifted = fill_grp.shift(periods=periods, freq=freq)
         return (filled / shifted) - 1
 
+    def value_counts(
+        self,
+        subset=None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ) -> DataFrameOrSeries:
+        """
+        Return a Series or DataFrame containing counts of unique rows.
+
+        Parameters
+        ----------
+        subset : list-like, optional
+            Columns to use when counting unique combinations.
+        normalize : bool, default False
+            Return proportions rather than frequencies.
+        sort : bool, default True
+            Sort by frequencies.
+        ascending : bool, default False
+            Sort in ascending order.
+        dropna : bool, default True
+            Don't include counts of rows that contain NA values.
+
+        Returns
+        -------
+        Series or DataFrame
+            Series if the groupby as_index is True, otherwise DataFrame.
+
+        See Also
+        --------
+        Series.value_counts: Equivalent method on Series.
+        DataFrame.value_counts: Equivalent method on DataFrame.
+        SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.
+
+        Notes
+        -----
+        - If the groupby as_index is True then the returned Series will have a
+          MultiIndex with one level per input column.
+        - If the groupby as_index is False then the returned DataFrame will
+          have an additional column with the value_counts. The column is
+          labelled 'count' or 'proportion', depending on the ``normalize``
+          parameter.
+
+        By default, rows that contain any NA values are omitted from
+        the result.
+
+        By default, the result will be in descending order so that the
+        first element of each group is the most frequently-occurring row.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({
+        ...    'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
+        ...    'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
+        ...    'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
+        ... })
+
+        >>> df
+                gender  education   country
+        0       male    low         US
+        1       male    medium      FR
+        2       female  high        US
+        3       male    low         FR
+        4       female  high        FR
+        5       male    low         FR
+
+        >>> df.groupby('gender').value_counts()
+        gender  education  country
+        female  high       FR         1
+                           US         1
+        male    low        FR         2
+                           US         1
+                medium     FR         1
+        Name: count, dtype: int64
+
+        >>> df.groupby('gender').value_counts(ascending=True)
+        gender  education  country
+        female  high       FR         1
+                           US         1
+        male    low        US         1
+                medium     FR         1
+                low        FR         2
+        Name: count, dtype: int64
+
+        >>> df.groupby('gender').value_counts(normalize=True)
+        gender  education  country
+        female  high       FR         0.50
+                           US         0.50
+        male    low        FR         0.50
+                           US         0.25
+                medium     FR         0.25
+        Name: proportion, dtype: float64
+
+        >>> df.groupby('gender', as_index=False).value_counts()
+           gender education country  count
+        0  female      high      FR      1
+        1  female      high      US      1
+        2    male       low      FR      2
+        3    male       low      US      1
+        4    male    medium      FR      1
+
+        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
+           gender education country  proportion
+        0  female      high      FR        0.50
+        1  female      high      US        0.50
+        2    male       low      FR        0.50
+        3    male       low      US        0.25
+        4    male    medium      FR        0.25
+        """
+
+        df = cudf.DataFrame.copy(self.obj)
+        groupings = self.grouping.names
+        name = "proportion" if normalize else "count"
+
+        if subset is None:
+            subset = [i for i in df._column_names if i not in groupings]
+        # Check subset exists in dataframe
+        elif set(subset) - set(df._column_names):
+            raise ValueError(
+                f"Keys {set(subset) - set(df._column_names)} in subset "
+                f"do not exist in the DataFrame."
+            )
+        # Catch case where groupby and subset share an element
+        elif set(subset) & set(groupings):
+            raise ValueError(
+                f"Keys {set(subset) & set(groupings)} in subset "
+                "cannot be in the groupby column keys."
+            )
+
+        df["__placeholder"] = 1
+        result = (
+            df.groupby(groupings + list(subset), dropna=dropna)[
+                "__placeholder"
+            ]
+            .count()
+            .sort_index()
+            .astype(np.int64)
+        )
+
+        if normalize:
+            levels = list(range(len(groupings), result.index.nlevels))
+            result /= result.groupby(
+                result.index.droplevel(levels),
+            ).transform("sum")
+
+        if sort:
+            result = result.sort_values(ascending=ascending).sort_index(
+                level=range(len(groupings)), sort_remaining=False
+            )
+
+        if not self._as_index:
+            if name in df._column_names:
+                raise ValueError(
+                    f"Column label '{name}' is duplicate of result column"
+                )
+            result.name = name
+            result = result.to_frame().reset_index()
+        else:
+            result.name = name
+
+        return result
+
     def _mimic_pandas_order(
         self, result: DataFrameOrSeries
     ) -> DataFrameOrSeries:
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 042f0e1aa38..376639d5226 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -3473,3 +3473,70 @@ def test_categorical_grouping_pandas_compatibility():
     expected = pdf.groupby("key", sort=False).sum()
 
     assert_eq(actual, expected)
+
+
+@pytest.mark.parametrize("normalize", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("dropna", [True, False])
+@pytest.mark.parametrize("as_index", [True, False])
+def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index):
+    # From Issue#12789
+    df = cudf.DataFrame(
+        {
+            "gender": ["male", "male", "female", "male", "female", "male"],
+            "education": ["low", "medium", np.nan, "low", "high", "low"],
+            "country": ["US", "FR", "US", "FR", "FR", "FR"],
+        }
+    )
+    pdf = df.to_pandas()
+
+    actual = df.groupby("gender", as_index=as_index).value_counts(
+        normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
+    )
+    expected = pdf.groupby("gender", as_index=as_index).value_counts(
+        normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
+    )
+
+    # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0`
+    assert_groupby_results_equal(
+        actual, expected, check_names=False, check_index_type=False
+    )
+
+
+def test_group_by_value_counts_subset():
+    # From Issue#12789
+    df = cudf.DataFrame(
+        {
+            "gender": ["male", "male", "female", "male", "female", "male"],
+            "education": ["low", "medium", "high", "low", "high", "low"],
+            "country": ["US", "FR", "US", "FR", "FR", "FR"],
+        }
+    )
+    pdf = df.to_pandas()
+
+    actual = df.groupby("gender").value_counts(["education"])
+    expected = pdf.groupby("gender").value_counts(["education"])
+
+    # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0`
+    assert_groupby_results_equal(
+        actual, expected, check_names=False, check_index_type=False
+    )
+
+
+def test_group_by_value_counts_clash_with_subset():
+    df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]})
+    with pytest.raises(ValueError):
+        df.groupby("a").value_counts(["a"])
+
+
+def test_group_by_value_counts_subset_not_exists():
+    df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]})
+    with pytest.raises(ValueError):
+        df.groupby("a").value_counts(["c"])
+
+
+def test_group_by_value_counts_with_count_column():
+    df = cudf.DataFrame({"a": [1, 5, 3], "count": [2, 5, 2]})
+    with pytest.raises(ValueError):
+        df.groupby("a", as_index=False).value_counts()

From 7b0693f6a5fd58e247a7669a813c6ffba850e4e0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 20 Sep 2023 04:46:35 -1000
Subject: [PATCH 17/23] Fix DataFrame.values with no columns but index (#14134)

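Fixes `DataFrame.values` for a DataFrame that has an index but no columns. Previously it returned an array of shape `(0, 0)` instead of `(len(df), 0)`, as shown below: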

```python
In [32]: cudf.DataFrame(index=range(10)).values
Out[32]: array([], shape=(0, 0), dtype=float64)
```

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/14134
---
 python/cudf/cudf/core/frame.py           | 2 +-
 python/cudf/cudf/tests/test_dataframe.py | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 6224793d6f1..1e6d177f8ca 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -437,7 +437,7 @@ def get_column_values_na(col):
         ncol = self._num_columns
         if ncol == 0:
             return make_empty_matrix(
-                shape=(0, 0), dtype=np.dtype("float64"), order="F"
+                shape=(len(self), ncol), dtype=np.dtype("float64"), order="F"
             )
 
         if dtype is None:
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index cbef9bfa2d8..b69f22ade81 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10374,3 +10374,9 @@ def test_dataframe_init_from_nested_dict():
     pdf = pd.DataFrame(regular_dict)
     gdf = cudf.DataFrame(regular_dict)
     assert_eq(pdf, gdf)
+
+
+def test_data_frame_values_no_cols_but_index():
+    result = cudf.DataFrame(index=range(5)).values
+    expected = pd.DataFrame(index=range(5)).values
+    assert_eq(result, expected)

From f7ca051145d41cf323cfb5a066068cb8b75d3fb3 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 20 Sep 2023 10:49:06 -0500
Subject: [PATCH 18/23] Fix type of empty `Index` and raise warning in `Series`
 constructor (#14116)

Fixes: #14091
This PR changes the default dtype of an `Index` constructed from empty input (with no explicit dtype) to `str` instead of `float64`. It also adds a `FutureWarning` to the `Series` constructor when it is given empty input without an explicit dtype, to match pandas.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14116
---
 python/cudf/cudf/core/algorithms.py       | 21 +++++++----
 python/cudf/cudf/core/dataframe.py        |  2 +-
 python/cudf/cudf/core/index.py            | 12 ++++++-
 python/cudf/cudf/core/series.py           | 32 +++++++++++++++--
 python/cudf/cudf/testing/_utils.py        | 21 +++++++++--
 python/cudf/cudf/tests/test_dataframe.py  | 19 +++++-----
 python/cudf/cudf/tests/test_dropna.py     |  9 +++--
 python/cudf/cudf/tests/test_duplicates.py |  4 +--
 python/cudf/cudf/tests/test_index.py      | 16 ++++++---
 python/cudf/cudf/tests/test_rolling.py    |  9 +++--
 python/cudf/cudf/tests/test_series.py     | 43 ++++++++++++++---------
 python/cudf/cudf/tests/test_stats.py      | 23 ++++++------
 12 files changed, 148 insertions(+), 63 deletions(-)

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index a472142ece0..25d58029d6b 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -4,12 +4,13 @@
 import cupy as cp
 import numpy as np
 
+from cudf.core.column import as_column
 from cudf.core.copy_types import BooleanMask
-from cudf.core.index import Index, RangeIndex
+from cudf.core.index import RangeIndex, as_index
 from cudf.core.indexed_frame import IndexedFrame
 from cudf.core.scalar import Scalar
-from cudf.core.series import Series
 from cudf.options import get_option
+from cudf.utils.dtypes import can_convert_to_column
 
 
 def factorize(
@@ -95,7 +96,13 @@ def factorize(
 
     return_cupy_array = isinstance(values, cp.ndarray)
 
-    values = Series(values)
+    if not can_convert_to_column(values):
+        raise TypeError(
+            "'values' can only be a Series, Index, or CuPy array, "
+            f"got {type(values)}"
+        )
+
+    values = as_column(values)
 
     if na_sentinel is None:
         na_sentinel = (
@@ -128,22 +135,22 @@ def factorize(
         warnings.warn("size_hint is not applicable for cudf.factorize")
 
     if use_na_sentinel is None or use_na_sentinel:
-        cats = values._column.dropna()
+        cats = values.dropna()
     else:
-        cats = values._column
+        cats = values
 
     cats = cats.unique().astype(values.dtype)
 
     if sort:
         cats = cats.sort_values()
 
-    labels = values._column._label_encoding(
+    labels = values._label_encoding(
         cats=cats,
         na_sentinel=Scalar(na_sentinel),
         dtype="int64" if get_option("mode.pandas_compatible") else None,
     ).values
 
-    return labels, cats.values if return_cupy_array else Index(cats)
+    return labels, cats.values if return_cupy_array else as_index(cats)
 
 
 def _linear_interpolation(column, index=None):
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 84c16b71997..6e664468644 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5607,7 +5607,7 @@ def quantile(
                 result.name = q
                 return result
 
-        result.index = list(map(float, qs))
+        result.index = cudf.Index(list(map(float, qs)), dtype="float64")
         return result
 
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 56ec9ce0359..de8a5948033 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -13,6 +13,7 @@
     List,
     MutableMapping,
     Optional,
+    Sequence,
     Tuple,
     Type,
     Union,
@@ -3467,7 +3468,7 @@ def __new__(
                 "tupleize_cols != True is not yet supported"
             )
 
-        return as_index(
+        res = as_index(
             data,
             copy=copy,
             dtype=dtype,
@@ -3475,6 +3476,15 @@ def __new__(
             nan_as_null=nan_as_null,
             **kwargs,
         )
+        if (
+            isinstance(data, Sequence)
+            and not isinstance(data, range)
+            and len(data) == 0
+            and dtype is None
+            and getattr(data, "dtype", None) is None
+        ):
+            return res.astype("str")
+        return res
 
     @classmethod
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 7692d3015f8..a195738af54 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -9,7 +9,16 @@
 import warnings
 from collections import abc
 from shutil import get_terminal_size
-from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Union
+from typing import (
+    Any,
+    Dict,
+    MutableMapping,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+)
 
 import cupy
 import numpy as np
@@ -500,6 +509,18 @@ def __init__(
         copy=False,
         nan_as_null=True,
     ):
+        if (
+            isinstance(data, Sequence)
+            and len(data) == 0
+            and dtype is None
+            and getattr(data, "dtype", None) is None
+        ):
+            warnings.warn(
+                "The default dtype for empty Series will be 'object' instead "
+                "of 'float64' in a future version. Specify a dtype explicitly "
+                "to silence this warning.",
+                FutureWarning,
+            )
         if isinstance(data, pd.Series):
             if name is None:
                 name = data.name
@@ -656,7 +677,10 @@ def from_pandas(cls, s, nan_as_null=None):
         3     NaN
         dtype: float64
         """
-        return cls(s, nan_as_null=nan_as_null)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            result = cls(s, nan_as_null=nan_as_null)
+        return result
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -2642,7 +2666,9 @@ def mode(self, dropna=True):
         if len(val_counts) > 0:
             val_counts = val_counts[val_counts == val_counts.iloc[0]]
 
-        return Series(val_counts.index.sort_values(), name=self.name)
+        return Series._from_data(
+            {self.name: val_counts.index.sort_values()}, name=self.name
+        )
 
     @_cudf_nvtx_annotate
     def round(self, decimals=0, how="half_even"):
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index e949f7d78e7..9182246826f 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -397,8 +397,12 @@ def assert_column_memory_ne(
     raise AssertionError("lhs and rhs holds the same memory.")
 
 
-def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
-    # Wrapper around pd.Series using a float64 default dtype for empty data.
+def _create_pandas_series_float64_default(
+    data=None, index=None, dtype=None, *args, **kwargs
+):
+    # Wrapper around pd.Series using a float64
+    # default dtype for empty data to silence warnings.
+    # TODO: Remove this in pandas-2.0 upgrade
     if dtype is None and (
         data is None or (not is_scalar(data) and len(data) == 0)
     ):
@@ -406,6 +410,19 @@ def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
     return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs)
 
 
+def _create_cudf_series_float64_default(
+    data=None, index=None, dtype=None, *args, **kwargs
+):
+    # Wrapper around cudf.Series using a float64
+    # default dtype for empty data to silence warnings.
+    # TODO: Remove this in pandas-2.0 upgrade
+    if dtype is None and (
+        data is None or (not is_scalar(data) and len(data) == 0)
+    ):
+        dtype = "float64"
+    return cudf.Series(data=data, index=index, dtype=dtype, *args, **kwargs)
+
+
 parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
     "left_dtype,right_dtype",
     list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index b69f22ade81..bc85987c612 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -30,6 +30,7 @@
     ALL_TYPES,
     DATETIME_TYPES,
     NUMERIC_TYPES,
+    _create_cudf_series_float64_default,
     assert_eq,
     assert_exceptions_equal,
     assert_neq,
@@ -2000,8 +2001,8 @@ def test_series_shape():
 
 
 def test_series_shape_empty():
-    ps = pd.Series(dtype="float64")
-    cs = cudf.Series([])
+    ps = pd.Series([], dtype="float64")
+    cs = cudf.Series([], dtype="float64")
 
     assert ps.shape == cs.shape
 
@@ -2840,7 +2841,7 @@ def test_series_all_null(num_elements, null_type):
 @pytest.mark.parametrize("num_elements", [0, 2, 10, 100])
 def test_series_all_valid_nan(num_elements):
     data = [np.nan] * num_elements
-    sr = cudf.Series(data, nan_as_null=False)
+    sr = _create_cudf_series_float64_default(data, nan_as_null=False)
     np.testing.assert_equal(sr.null_count, 0)
 
 
@@ -4073,28 +4074,28 @@ def test_empty_dataframe_describe():
 
 
 def test_as_column_types():
-    col = column.as_column(cudf.Series([]))
+    col = column.as_column(cudf.Series([], dtype="float64"))
     assert_eq(col.dtype, np.dtype("float64"))
     gds = cudf.Series(col)
     pds = pd.Series(pd.Series([], dtype="float64"))
 
     assert_eq(pds, gds)
 
-    col = column.as_column(cudf.Series([]), dtype="float32")
+    col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32")
     assert_eq(col.dtype, np.dtype("float32"))
     gds = cudf.Series(col)
     pds = pd.Series(pd.Series([], dtype="float32"))
 
     assert_eq(pds, gds)
 
-    col = column.as_column(cudf.Series([]), dtype="str")
+    col = column.as_column(cudf.Series([], dtype="float64"), dtype="str")
     assert_eq(col.dtype, np.dtype("object"))
     gds = cudf.Series(col)
     pds = pd.Series(pd.Series([], dtype="str"))
 
     assert_eq(pds, gds)
 
-    col = column.as_column(cudf.Series([]), dtype="object")
+    col = column.as_column(cudf.Series([], dtype="float64"), dtype="object")
     assert_eq(col.dtype, np.dtype("object"))
     gds = cudf.Series(col)
     pds = pd.Series(pd.Series([], dtype="object"))
@@ -4469,7 +4470,7 @@ def test_create_dataframe_column():
 )
 def test_series_values_host_property(data):
     pds = pd.Series(data=data, dtype=None if data else float)
-    gds = cudf.Series(data)
+    gds = _create_cudf_series_float64_default(data)
 
     np.testing.assert_array_equal(pds.values, gds.values_host)
 
@@ -4492,7 +4493,7 @@ def test_series_values_host_property(data):
 )
 def test_series_values_property(data):
     pds = pd.Series(data=data, dtype=None if data else float)
-    gds = cudf.Series(data)
+    gds = _create_cudf_series_float64_default(data)
     gds_vals = gds.values
     assert isinstance(gds_vals, cupy.ndarray)
     np.testing.assert_array_equal(gds_vals.get(), pds.values)
diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
index 3277e52edb3..1def6597706 100644
--- a/python/cudf/cudf/tests/test_dropna.py
+++ b/python/cudf/cudf/tests/test_dropna.py
@@ -1,11 +1,14 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
 import pytest
 
 import cudf
-from cudf.testing._utils import _create_pandas_series, assert_eq
+from cudf.testing._utils import (
+    _create_pandas_series_float64_default,
+    assert_eq,
+)
 
 
 @pytest.mark.parametrize(
@@ -22,7 +25,7 @@
 @pytest.mark.parametrize("inplace", [True, False])
 def test_dropna_series(data, nulls, inplace):
 
-    psr = _create_pandas_series(data)
+    psr = _create_pandas_series_float64_default(data)
 
     if len(data) > 0:
         if nulls == "one":
diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py
index f77e7b4d775..ddbfdf5eee2 100644
--- a/python/cudf/cudf/tests/test_duplicates.py
+++ b/python/cudf/cudf/tests/test_duplicates.py
@@ -10,7 +10,7 @@
 import cudf
 from cudf import concat
 from cudf.testing._utils import (
-    _create_pandas_series,
+    _create_pandas_series_float64_default,
     assert_eq,
     assert_exceptions_equal,
 )
@@ -62,7 +62,7 @@ def test_duplicated_with_misspelled_column_name(subset):
     ],
 )
 def test_drop_duplicates_series(data, keep):
-    pds = _create_pandas_series(data)
+    pds = _create_pandas_series_float64_default(data)
     gds = cudf.from_pandas(pds)
 
     assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep))
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index b3791cddce3..29232f63e90 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -30,7 +30,8 @@
     SIGNED_INTEGER_TYPES,
     SIGNED_TYPES,
     UNSIGNED_TYPES,
-    _create_pandas_series,
+    _create_cudf_series_float64_default,
+    _create_pandas_series_float64_default,
     assert_column_memory_eq,
     assert_column_memory_ne,
     assert_eq,
@@ -1006,8 +1007,8 @@ def test_index_equal_misc(data, other):
     actual = gd_data.equals(np.array(gd_other))
     assert_eq(expected, actual)
 
-    expected = pd_data.equals(_create_pandas_series(pd_other))
-    actual = gd_data.equals(cudf.Series(gd_other))
+    expected = pd_data.equals(_create_pandas_series_float64_default(pd_other))
+    actual = gd_data.equals(_create_cudf_series_float64_default(gd_other))
     assert_eq(expected, actual)
 
     expected = pd_data.astype("category").equals(pd_other)
@@ -2275,7 +2276,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null):
     ],
 )
 def test_isin_index(data, values):
-    psr = _create_pandas_series(data)
+    psr = _create_pandas_series_float64_default(data)
     gsr = cudf.Series.from_pandas(psr)
 
     got = gsr.index.isin(values)
@@ -2780,6 +2781,13 @@ def test_index_empty_from_pandas(request, dtype):
     assert_eq(pidx, gidx)
 
 
+def test_empty_index_init():
+    pidx = pd.Index([])
+    gidx = cudf.Index([])
+
+    assert_eq(pidx, gidx)
+
+
 @pytest.mark.parametrize(
     "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)]
 )
diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py
index b4e0983a9e3..43fa83e1735 100644
--- a/python/cudf/cudf/tests/test_rolling.py
+++ b/python/cudf/cudf/tests/test_rolling.py
@@ -9,7 +9,10 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140
-from cudf.testing._utils import _create_pandas_series, assert_eq
+from cudf.testing._utils import (
+    _create_pandas_series_float64_default,
+    assert_eq,
+)
 from cudf.testing.dataset_generator import rand_dataframe
 
 
@@ -55,7 +58,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center):
         elif nulls == "all":
             data = [np.nan] * len(data)
 
-    psr = _create_pandas_series(data, index=index)
+    psr = _create_pandas_series_float64_default(data, index=index)
     gsr = cudf.Series(psr)
     for window_size in range(1, len(data) + 1):
         for min_periods in range(1, window_size + 1):
@@ -313,7 +316,7 @@ def test_rolling_getitem_window():
 @pytest.mark.parametrize("center", [True, False])
 def test_rollling_series_numba_udf_basic(data, index, center):
 
-    psr = _create_pandas_series(data, index=index)
+    psr = _create_pandas_series_float64_default(data, index=index)
     gsr = cudf.from_pandas(psr)
 
     def some_func(A):
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index b1e991106ee..cfa571a0f54 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -19,7 +19,8 @@
     NUMERIC_TYPES,
     SERIES_OR_INDEX_NAMES,
     TIMEDELTA_TYPES,
-    _create_pandas_series,
+    _create_cudf_series_float64_default,
+    _create_pandas_series_float64_default,
     assert_eq,
     assert_exceptions_equal,
     expect_warning_if,
@@ -400,8 +401,8 @@ def test_series_tolist(data):
     [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57],
 )
 def test_series_size(data):
-    psr = _create_pandas_series(data)
-    gsr = cudf.Series(data)
+    psr = _create_pandas_series_float64_default(data)
+    gsr = _create_cudf_series_float64_default(data)
 
     assert_eq(psr.size, gsr.size)
 
@@ -487,7 +488,7 @@ def test_series_describe_other_types(ps):
 )
 @pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0])
 def test_series_factorize(data, na_sentinel):
-    gsr = cudf.Series(data)
+    gsr = _create_cudf_series_float64_default(data)
     psr = gsr.to_pandas()
 
     with pytest.warns(FutureWarning):
@@ -510,7 +511,7 @@ def test_series_factorize(data, na_sentinel):
 )
 @pytest.mark.parametrize("use_na_sentinel", [True, False])
 def test_series_factorize_use_na_sentinel(data, use_na_sentinel):
-    gsr = cudf.Series(data)
+    gsr = _create_cudf_series_float64_default(data)
     psr = gsr.to_pandas(nullable=True)
 
     expected_labels, expected_cats = psr.factorize(
@@ -534,7 +535,7 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel):
 )
 @pytest.mark.parametrize("sort", [True, False])
 def test_series_factorize_sort(data, sort):
-    gsr = cudf.Series(data)
+    gsr = _create_cudf_series_float64_default(data)
     psr = gsr.to_pandas(nullable=True)
 
     expected_labels, expected_cats = psr.factorize(sort=sort)
@@ -734,7 +735,7 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize):
             ],
             dtype="datetime64[ns]",
         ),
-        cudf.Series(name="empty series"),
+        cudf.Series(name="empty series", dtype="float64"),
         cudf.Series(["a", "b", "c", " ", "a", "b", "z"], dtype="category"),
     ],
 )
@@ -1415,7 +1416,7 @@ def test_series_hash_values_invalid_method():
 
 
 def test_set_index_unequal_length():
-    s = cudf.Series()
+    s = cudf.Series(dtype="float64")
     with pytest.raises(ValueError):
         s.index = [1, 2, 3]
 
@@ -1682,7 +1683,7 @@ def test_series_nunique_index(data):
     ],
 )
 def test_axes(data):
-    csr = cudf.Series(data)
+    csr = _create_cudf_series_float64_default(data)
     psr = csr.to_pandas()
 
     expected = psr.axes
@@ -1760,7 +1761,7 @@ def test_series_truncate_datetimeindex():
 )
 def test_isin_numeric(data, values):
     index = np.random.randint(0, 100, len(data))
-    psr = _create_pandas_series(data, index=index)
+    psr = _create_pandas_series_float64_default(data, index=index)
     gsr = cudf.Series.from_pandas(psr, nan_as_null=False)
 
     expected = psr.isin(values)
@@ -1820,7 +1821,7 @@ def test_fill_new_category():
     ],
 )
 def test_isin_datetime(data, values):
-    psr = _create_pandas_series(data)
+    psr = _create_pandas_series_float64_default(data)
     gsr = cudf.Series.from_pandas(psr)
 
     got = gsr.isin(values)
@@ -1849,7 +1850,7 @@ def test_isin_datetime(data, values):
     ],
 )
 def test_isin_string(data, values):
-    psr = _create_pandas_series(data)
+    psr = _create_pandas_series_float64_default(data)
     gsr = cudf.Series.from_pandas(psr)
 
     got = gsr.isin(values)
@@ -1878,7 +1879,7 @@ def test_isin_string(data, values):
     ],
 )
 def test_isin_categorical(data, values):
-    psr = _create_pandas_series(data)
+    psr = _create_pandas_series_float64_default(data)
     gsr = cudf.Series.from_pandas(psr)
 
     got = gsr.isin(values)
@@ -2099,7 +2100,7 @@ def test_series_to_dict(into):
     ],
 )
 def test_series_hasnans(data):
-    gs = cudf.Series(data, nan_as_null=False)
+    gs = _create_cudf_series_float64_default(data, nan_as_null=False)
     ps = gs.to_pandas(nullable=True)
 
     assert_eq(gs.hasnans, ps.hasnans)
@@ -2170,8 +2171,8 @@ def test_series_init_dict_with_index(data, index):
     "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]
 )
 def test_series_init_scalar_with_index(data, index):
-    pandas_series = _create_pandas_series(data, index=index)
-    cudf_series = cudf.Series(data, index=index)
+    pandas_series = _create_pandas_series_float64_default(data, index=index)
+    cudf_series = _create_cudf_series_float64_default(data, index=index)
 
     assert_eq(
         pandas_series,
@@ -2313,7 +2314,15 @@ def test_series_round_builtin(data, digits):
     assert_eq(expected, actual)
 
 
+def test_series_empty_warning():
+    with pytest.warns(FutureWarning):
+        expected = pd.Series([])
+    with pytest.warns(FutureWarning):
+        actual = cudf.Series([])
+    assert_eq(expected, actual)
+
+
 def test_series_count_invalid_param():
-    s = cudf.Series([])
+    s = cudf.Series([], dtype="float64")
     with pytest.raises(TypeError):
         s.count(skipna=True)
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index 463cdb8a7f4..3ac605a1a4d 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -10,7 +10,8 @@
 import cudf
 from cudf.datasets import randomdata
 from cudf.testing._utils import (
-    _create_pandas_series,
+    _create_cudf_series_float64_default,
+    _create_pandas_series_float64_default,
     assert_eq,
     assert_exceptions_equal,
     expect_warning_if,
@@ -222,8 +223,8 @@ def test_approx_quantiles_int():
 )
 def test_misc_quantiles(data, q):
 
-    pdf_series = _create_pandas_series(data)
-    gdf_series = cudf.Series(data)
+    pdf_series = _create_pandas_series_float64_default(data)
+    gdf_series = _create_cudf_series_float64_default(data)
 
     expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q)
     actual = gdf_series.quantile(q)
@@ -242,7 +243,7 @@ def test_misc_quantiles(data, q):
             [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False
         ),
         cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]),
-        cudf.Series([]),
+        cudf.Series([], dtype="float64"),
         cudf.Series([-3]),
     ],
 )
@@ -292,7 +293,7 @@ def test_kurt_skew_error(op):
             [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False
         ),
         cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]),
-        cudf.Series([]),
+        cudf.Series([], dtype="float64"),
         cudf.Series([-3]),
     ],
 )
@@ -348,7 +349,7 @@ def test_series_median(dtype, num_na):
         np.zeros(100),
         np.array([1.123, 2.343, np.nan, 0.0]),
         np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]),
-        cudf.Series([]),
+        cudf.Series([], dtype="float64"),
         cudf.Series([-3]),
     ],
 )
@@ -376,7 +377,7 @@ def test_series_pct_change(data, periods, fill_method):
         np.array([1.123, 2.343, np.nan, 0.0]),
         cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False),
         cudf.Series([1.1, 2.32, 43.4], index=[0, 4, 3]),
-        cudf.Series([]),
+        cudf.Series([], dtype="float64"),
         cudf.Series([-3]),
     ],
 )
@@ -420,7 +421,7 @@ def test_cov1d(data1, data2):
         np.array([1.123, 2.343, np.nan, 0.0]),
         cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False),
         cudf.Series([1.1032, 2.32, 43.4], index=[0, 4, 3]),
-        cudf.Series([]),
+        cudf.Series([], dtype="float64"),
         cudf.Series([-3]),
     ],
 )
@@ -524,14 +525,14 @@ def test_df_corr(method):
 )
 @pytest.mark.parametrize("skipna", [True, False])
 def test_nans_stats(data, ops, skipna):
-    psr = _create_pandas_series(data)
-    gsr = cudf.Series(data, nan_as_null=False)
+    psr = _create_pandas_series_float64_default(data)
+    gsr = _create_cudf_series_float64_default(data, nan_as_null=False)
 
     assert_eq(
         getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna)
     )
 
-    gsr = cudf.Series(data, nan_as_null=False)
+    gsr = _create_cudf_series_float64_default(data, nan_as_null=False)
     # Since there is no concept of `nan_as_null` in pandas,
     # nulls will be returned in the operations. So only
     # testing for `skipna=True` when `nan_as_null=False`

From eb6d134d169ed077000ee7d075d5363dec066578 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 20 Sep 2023 06:49:14 -1000
Subject: [PATCH 19/23] Don't sort columns for DataFrame init from list of
 Series (#14136)

closes #14132

This PR removes the re-sorting of dataframe columns when initialized by a series list.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14136
---
 python/cudf/cudf/core/dataframe.py       |  4 +---
 python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 6e664468644..1a780cc9e9f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7885,9 +7885,7 @@ def _get_union_of_indices(indexes):
         return indexes[0]
     else:
         merged_index = cudf.core.index.GenericIndex._concat(indexes)
-        merged_index = merged_index.drop_duplicates()
-        inds = merged_index._values.argsort()
-        return merged_index.take(inds)
+        return merged_index.drop_duplicates()
 
 
 def _get_union_of_series_names(series_list):
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index bc85987c612..6180162ecdd 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -221,6 +221,18 @@ def test_init_unaligned_with_index():
     assert_eq(pdf, gdf, check_dtype=False)
 
 
+def test_init_series_list_columns_unsort():
+    pseries = [
+        pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3)
+    ]
+    gseries = [
+        cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3)
+    ]
+    pdf = pd.DataFrame(pseries)
+    gdf = cudf.DataFrame(gseries)
+    assert_eq(pdf, gdf)
+
+
 def test_series_basic():
     # Make series from buffer
     a1 = np.arange(10, dtype=np.float64)

From 40d4cc5565f600864c3b16f30d3d26fd4904deaf Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Wed, 20 Sep 2023 11:03:44 -0700
Subject: [PATCH 20/23] Refactor parquet thrift reader (#14097)

Refactors the current `CompactProtocolReader` used to parse parquet file metadata. The main goal of the refactor is to allow easier use of `std::optional` fields in the thrift structs to prevent situations as in #14024 where an optional field is an empty string. The writer cannot distinguish between present-but-empty and not-present, so chooses the latter when writing the field. This PR adds a `ParquetFieldOptional` functor that can wrap the other field functors, obviating the need to write a new optional functor for each type.

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/14097
---
 .../io/parquet/compact_protocol_reader.cpp    | 691 +++++++++++++++---
 .../io/parquet/compact_protocol_reader.hpp    | 586 +--------------
 .../io/parquet/compact_protocol_writer.cpp    |  30 +-
 .../io/parquet/compact_protocol_writer.hpp    |   3 +
 cpp/src/io/parquet/parquet.hpp                |  18 +-
 cpp/src/io/parquet/parquet_common.hpp         |   2 +-
 cpp/src/io/parquet/writer_impl.cu             |  38 +-
 7 files changed, 662 insertions(+), 706 deletions(-)

diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index ae11af92f78..5c7b8ca3f8c 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -18,27 +18,474 @@
 
 #include <algorithm>
 #include <cstddef>
+#include <functional>
 #include <tuple>
 
 namespace cudf {
 namespace io {
 namespace parquet {
-uint8_t const CompactProtocolReader::g_list2struct[16] = {0,
-                                                          1,
-                                                          2,
-                                                          ST_FLD_BYTE,
-                                                          ST_FLD_DOUBLE,
-                                                          5,
-                                                          ST_FLD_I16,
-                                                          7,
-                                                          ST_FLD_I32,
-                                                          9,
-                                                          ST_FLD_I64,
-                                                          ST_FLD_BINARY,
-                                                          ST_FLD_STRUCT,
-                                                          ST_FLD_MAP,
-                                                          ST_FLD_SET,
-                                                          ST_FLD_LIST};
+
+/**
+ * @brief Base class for parquet field functors.
+ *
+ * Holds the field value used by all of the specialized functors.
+ */
+class parquet_field {
+ private:
+  int _field_val;
+
+ protected:
+  parquet_field(int f) : _field_val(f) {}
+
+ public:
+  virtual ~parquet_field() = default;
+  int field() const { return _field_val; }
+};
+
+/**
+ * @brief Abstract base class for list functors.
+ */
+template <typename T>
+class parquet_field_list : public parquet_field {
+ private:
+  using read_func_type = std::function<bool(uint32_t, CompactProtocolReader*)>;
+  FieldType _expected_type;
+  read_func_type _read_value;
+
+ protected:
+  std::vector<T>& val;
+
+  void bind_read_func(read_func_type fn) { _read_value = fn; }
+
+  parquet_field_list(int f, std::vector<T>& v, FieldType t)
+    : parquet_field(f), _expected_type(t), val(v)
+  {
+  }
+
+ public:
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    if (field_type != ST_FLD_LIST) { return true; }
+    auto const [t, n] = cpr->get_listh();
+    if (t != _expected_type) { return true; }
+    val.resize(n);
+    for (uint32_t i = 0; i < n; i++) {
+      if (_read_value(i, cpr)) { return true; }
+    }
+    return false;
+  }
+};
+
+/**
+ * @brief Functor to set value to bool read from CompactProtocolReader
+ *
+ * bool doesn't actually encode a value, we just use the field type to indicate true/false
+ *
+ * @return True if field type is not bool
+ */
+class parquet_field_bool : public parquet_field {
+  bool& val;
+
+ public:
+  parquet_field_bool(int f, bool& v) : parquet_field(f), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    if (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) { return true; }
+    val = field_type == ST_FLD_TRUE;
+    return false;
+  }
+};
+
+/**
+ * @brief Functor to read a vector of booleans from CompactProtocolReader
+ *
+ * @return True if field types mismatch or if the process of reading a
+ * bool fails
+ */
+struct parquet_field_bool_list : public parquet_field_list<bool> {
+  parquet_field_bool_list(int f, std::vector<bool>& v) : parquet_field_list(f, v, ST_FLD_TRUE)
+  {
+    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
+      auto const current_byte = cpr->getb();
+      if (current_byte != ST_FLD_TRUE && current_byte != ST_FLD_FALSE) { return true; }
+      this->val[i] = current_byte == ST_FLD_TRUE;
+      return false;
+    };
+    bind_read_func(read_value);
+  }
+};
+
+/**
+ * @brief Base type for a functor that reads an integer from CompactProtocolReader
+ *
+ * Assuming signed ints since the parquet spec does not use unsigned ints anywhere.
+ *
+ * @return True if there is a type mismatch
+ */
+template <typename T, int EXPECTED_TYPE>
+class parquet_field_int : public parquet_field {
+  static constexpr bool is_byte = std::is_same_v<T, int8_t>;
+
+  T& val;
+
+ public:
+  parquet_field_int(int f, T& v) : parquet_field(f), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    if constexpr (is_byte) {
+      val = cpr->getb();
+    } else {
+      val = cpr->get_zigzag<T>();
+    }
+    return (field_type != EXPECTED_TYPE);
+  }
+};
+
+using parquet_field_int8  = parquet_field_int<int8_t, ST_FLD_BYTE>;
+using parquet_field_int32 = parquet_field_int<int32_t, ST_FLD_I32>;
+using parquet_field_int64 = parquet_field_int<int64_t, ST_FLD_I64>;
+
+/**
+ * @brief Functor to read a vector of integers from CompactProtocolReader
+ *
+ * @return True if field types mismatch or if the process of reading an
+ * integer fails
+ */
+template <typename T, FieldType EXPECTED_TYPE>
+struct parquet_field_int_list : public parquet_field_list<T> {
+  parquet_field_int_list(int f, std::vector<T>& v) : parquet_field_list<T>(f, v, EXPECTED_TYPE)
+  {
+    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
+      this->val[i] = cpr->get_zigzag<T>();
+      return false;
+    };
+    this->bind_read_func(read_value);
+  }
+};
+
+using parquet_field_int64_list = parquet_field_int_list<int64_t, ST_FLD_I64>;
+
+/**
+ * @brief Functor to read a string from CompactProtocolReader
+ *
+ * @return True if field type mismatches or if size of string exceeds bounds
+ * of the CompactProtocolReader
+ */
+class parquet_field_string : public parquet_field {
+  std::string& val;
+
+ public:
+  parquet_field_string(int f, std::string& v) : parquet_field(f), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    if (field_type != ST_FLD_BINARY) { return true; }
+    auto const n = cpr->get_u32();
+    if (n < static_cast<size_t>(cpr->m_end - cpr->m_cur)) {
+      val.assign(reinterpret_cast<char const*>(cpr->m_cur), n);
+      cpr->m_cur += n;
+      return false;
+    } else {
+      return true;
+    }
+  }
+};
+
+/**
+ * @brief Functor to read a vector of strings from CompactProtocolReader
+ *
+ * @return True if field types mismatch or if the process of reading a
+ * string fails
+ */
+struct parquet_field_string_list : public parquet_field_list<std::string> {
+  parquet_field_string_list(int f, std::vector<std::string>& v)
+    : parquet_field_list(f, v, ST_FLD_BINARY)
+  {
+    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
+      auto const l = cpr->get_u32();
+      if (l < static_cast<size_t>(cpr->m_end - cpr->m_cur)) {
+        this->val[i].assign(reinterpret_cast<char const*>(cpr->m_cur), l);
+        cpr->m_cur += l;
+      } else {
+        return true;
+      }
+      return false;
+    };
+    bind_read_func(read_value);
+  }
+};
+
+/**
+ * @brief Functor to set value to enum read from CompactProtocolReader
+ *
+ * @return True if field type is not int32
+ */
+template <typename Enum>
+class parquet_field_enum : public parquet_field {
+  Enum& val;
+
+ public:
+  parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {}
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    val = static_cast<Enum>(cpr->get_i32());
+    return (field_type != ST_FLD_I32);
+  }
+};
+
+/**
+ * @brief Functor to read a vector of enums from CompactProtocolReader
+ *
+ * @return True if field types mismatch or if the process of reading an
+ * enum fails
+ */
+template <typename Enum>
+struct parquet_field_enum_list : public parquet_field_list<Enum> {
+  parquet_field_enum_list(int f, std::vector<Enum>& v) : parquet_field_list<Enum>(f, v, ST_FLD_I32)
+  {
+    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
+      this->val[i] = static_cast<Enum>(cpr->get_i32());
+      return false;
+    };
+    this->bind_read_func(read_value);
+  }
+};
+
+/**
+ * @brief Functor to read a structure from CompactProtocolReader
+ *
+ * @return True if field types mismatch or if the process of reading a
+ * struct fails
+ */
+template <typename T>
+class parquet_field_struct : public parquet_field {
+  T& val;
+
+ public:
+  parquet_field_struct(int f, T& v) : parquet_field(f), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    return (field_type != ST_FLD_STRUCT || !(cpr->read(&val)));
+  }
+};
+
+/**
+ * @brief Functor to read optional structures in unions
+ *
+ * @return True if field types mismatch or if reading the struct fails
+ */
+template <typename E, typename T>
+class parquet_field_union_struct : public parquet_field {
+  E& enum_val;
+  thrust::optional<T>& val;  // union structs are always wrapped in thrust::optional
+
+ public:
+  parquet_field_union_struct(int f, E& ev, thrust::optional<T>& v)
+    : parquet_field(f), enum_val(ev), val(v)
+  {
+  }
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    T v;
+    bool const res = parquet_field_struct<T>(field(), v).operator()(cpr, field_type);
+    if (!res) {
+      val      = v;
+      enum_val = static_cast<E>(field());
+    }
+    return res;
+  }
+};
+
+/**
+ * @brief Functor to read empty structures in unions
+ *
+ * Added to avoid having to define read() functions for empty structs contained in unions.
+ *
+ * @return True if field types mismatch
+ */
+template <typename E>
+class parquet_field_union_enumerator : public parquet_field {
+  E& val;
+
+ public:
+  parquet_field_union_enumerator(int f, E& v) : parquet_field(f), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    if (field_type != ST_FLD_STRUCT) { return true; }
+    cpr->skip_struct_field(field_type);
+    val = static_cast<E>(field());
+    return false;
+  }
+};
+
+/**
+ * @brief Functor to read a vector of structures from CompactProtocolReader
+ *
+ * @return True if field types mismatch or if the process of reading a
+ * struct fails
+ */
+template <typename T>
+struct parquet_field_struct_list : public parquet_field_list<T> {
+  parquet_field_struct_list(int f, std::vector<T>& v) : parquet_field_list<T>(f, v, ST_FLD_STRUCT)
+  {
+    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
+      if (not cpr->read(&this->val[i])) { return true; }
+      return false;
+    };
+    this->bind_read_func(read_value);
+  }
+};
+
+// TODO(ets): replace current union handling (which mirrors thrift) to use std::optional fields
+// in a struct
+/**
+ * @brief Functor to read a union member from CompactProtocolReader
+ *
+ * @tparam is_empty True if tparam `T` type is empty type, else false.
+ *
+ * @return True if field types mismatch or if the process of reading a
+ * union member fails
+ */
+template <typename T, bool is_empty = false>
+class ParquetFieldUnionFunctor : public parquet_field {
+  bool& is_set;
+  T& val;
+
+ public:
+  ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    if (field_type != ST_FLD_STRUCT) {
+      return true;
+    } else {
+      is_set = true;
+      return !cpr->read(&val);
+    }
+  }
+};
+
+template <typename T>
+class ParquetFieldUnionFunctor<T, true> : public parquet_field {
+  bool& is_set;
+  T& val;
+
+ public:
+  ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    if (field_type != ST_FLD_STRUCT) {
+      return true;
+    } else {
+      is_set = true;
+      cpr->skip_struct_field(field_type);
+      return false;
+    }
+  }
+};
+
+template <typename T>
+ParquetFieldUnionFunctor<T, std::is_empty_v<T>> ParquetFieldUnion(int f, bool& b, T& v)
+{
+  return ParquetFieldUnionFunctor<T, std::is_empty_v<T>>(f, b, v);
+}
+
+/**
+ * @brief Functor to read a binary from CompactProtocolReader
+ *
+ * @return True if field type mismatches or if size of binary exceeds bounds
+ * of the CompactProtocolReader
+ */
+class parquet_field_binary : public parquet_field {
+  std::vector<uint8_t>& val;
+
+ public:
+  parquet_field_binary(int f, std::vector<uint8_t>& v) : parquet_field(f), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    if (field_type != ST_FLD_BINARY) { return true; }
+    auto const n = cpr->get_u32();
+    if (n <= static_cast<size_t>(cpr->m_end - cpr->m_cur)) {
+      val.resize(n);
+      val.assign(cpr->m_cur, cpr->m_cur + n);
+      cpr->m_cur += n;
+      return false;
+    } else {
+      return true;
+    }
+  }
+};
+
+/**
+ * @brief Functor to read a vector of binaries from CompactProtocolReader
+ *
+ * @return True if field types mismatch or if the process of reading a
+ * binary fails
+ */
+struct parquet_field_binary_list : public parquet_field_list<std::vector<uint8_t>> {
+  parquet_field_binary_list(int f, std::vector<std::vector<uint8_t>>& v)
+    : parquet_field_list(f, v, ST_FLD_BINARY)
+  {
+    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
+      auto const l = cpr->get_u32();
+      if (l <= static_cast<size_t>(cpr->m_end - cpr->m_cur)) {
+        val[i].resize(l);
+        val[i].assign(cpr->m_cur, cpr->m_cur + l);
+        cpr->m_cur += l;
+      } else {
+        return true;
+      }
+      return false;
+    };
+    bind_read_func(read_value);
+  }
+};
+
+/**
+ * @brief Functor to capture the raw bytes of a struct from CompactProtocolReader
+ *
+ * @return True if field type mismatches
+ */
+class parquet_field_struct_blob : public parquet_field {
+  std::vector<uint8_t>& val;
+
+ public:
+  parquet_field_struct_blob(int f, std::vector<uint8_t>& v) : parquet_field(f), val(v) {}
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    if (field_type != ST_FLD_STRUCT) { return true; }
+    uint8_t const* const start = cpr->m_cur;
+    cpr->skip_struct_field(field_type);
+    if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); }
+    return false;
+  }
+};
+
+/**
+ * @brief Functor to wrap functors for optional fields; sets the optional only on success
+ */
+template <typename T, typename FieldFunctor>
+class parquet_field_optional : public parquet_field {
+  thrust::optional<T>& val;
+
+ public:
+  parquet_field_optional(int f, thrust::optional<T>& v) : parquet_field(f), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    T v;
+    bool const res = FieldFunctor(field(), v).operator()(cpr, field_type);
+    if (!res) { val = v; }
+    return res;
+  }
+};
 
 /**
  * @brief Skips the number of bytes according to the specified struct type
@@ -59,22 +506,21 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth)
     case ST_FLD_BYTE: skip_bytes(1); break;
     case ST_FLD_DOUBLE: skip_bytes(8); break;
     case ST_FLD_BINARY: skip_bytes(get_u32()); break;
-    case ST_FLD_LIST:
+    case ST_FLD_LIST: [[fallthrough]];
     case ST_FLD_SET: {
-      int c = getb();
-      int n = c >> 4;
-      if (n == 0xf) n = get_i32();
-      t = g_list2struct[c & 0xf];
-      if (depth > 10) return false;
-      for (int32_t i = 0; i < n; i++)
+      auto const [t, n] = get_listh();
+      if (depth > 10) { return false; }
+      for (uint32_t i = 0; i < n; i++) {
         skip_struct_field(t, depth + 1);
+      }
     } break;
     case ST_FLD_STRUCT:
       for (;;) {
-        int c = getb();
-        t     = c & 0xf;
-        if (!c) break;
-        if (depth > 10) return false;
+        int const c = getb();
+        t           = c & 0xf;
+        if (c == 0) { break; }               // end of struct
+        if ((c & 0xf0) == 0) { get_i16(); }  // field id is not a delta
+        if (depth > 10) { return false; }
         skip_struct_field(t, depth + 1);
       }
       break;
@@ -125,11 +571,11 @@ inline bool function_builder(CompactProtocolReader* cpr, std::tuple<Operator...>
   int field           = 0;
   while (true) {
     int const current_byte = cpr->getb();
-    if (!current_byte) break;
-    int const field_delta = current_byte >> 4;
-    int const field_type  = current_byte & 0xf;
-    field                 = field_delta ? field + field_delta : cpr->get_i16();
-    bool exit_function    = FunctionSwitchImpl<index>::run(cpr, field_type, field, op);
+    if (!current_byte) { break; }
+    int const field_delta    = current_byte >> 4;
+    int const field_type     = current_byte & 0xf;
+    field                    = field_delta ? field + field_delta : cpr->get_i16();
+    bool const exit_function = FunctionSwitchImpl<index>::run(cpr, field_type, field, op);
     if (exit_function) { return false; }
   }
   return true;
@@ -137,27 +583,30 @@ inline bool function_builder(CompactProtocolReader* cpr, std::tuple<Operator...>
 
 bool CompactProtocolReader::read(FileMetaData* f)
 {
-  auto op = std::make_tuple(ParquetFieldInt32(1, f->version),
-                            ParquetFieldStructList(2, f->schema),
-                            ParquetFieldInt64(3, f->num_rows),
-                            ParquetFieldStructList(4, f->row_groups),
-                            ParquetFieldStructList(5, f->key_value_metadata),
-                            ParquetFieldString(6, f->created_by));
+  using optional_list_column_order =
+    parquet_field_optional<std::vector<ColumnOrder>, parquet_field_struct_list<ColumnOrder>>;
+  auto op = std::make_tuple(parquet_field_int32(1, f->version),
+                            parquet_field_struct_list(2, f->schema),
+                            parquet_field_int64(3, f->num_rows),
+                            parquet_field_struct_list(4, f->row_groups),
+                            parquet_field_struct_list(5, f->key_value_metadata),
+                            parquet_field_string(6, f->created_by),
+                            optional_list_column_order(7, f->column_orders));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(SchemaElement* s)
 {
-  auto op = std::make_tuple(ParquetFieldEnum<Type>(1, s->type),
-                            ParquetFieldInt32(2, s->type_length),
-                            ParquetFieldEnum<FieldRepetitionType>(3, s->repetition_type),
-                            ParquetFieldString(4, s->name),
-                            ParquetFieldInt32(5, s->num_children),
-                            ParquetFieldEnum<ConvertedType>(6, s->converted_type),
-                            ParquetFieldInt32(7, s->decimal_scale),
-                            ParquetFieldInt32(8, s->decimal_precision),
-                            ParquetFieldOptionalInt32(9, s->field_id),
-                            ParquetFieldStruct(10, s->logical_type));
+  auto op = std::make_tuple(parquet_field_enum<Type>(1, s->type),
+                            parquet_field_int32(2, s->type_length),
+                            parquet_field_enum<FieldRepetitionType>(3, s->repetition_type),
+                            parquet_field_string(4, s->name),
+                            parquet_field_int32(5, s->num_children),
+                            parquet_field_enum<ConvertedType>(6, s->converted_type),
+                            parquet_field_int32(7, s->decimal_scale),
+                            parquet_field_int32(8, s->decimal_precision),
+                            parquet_field_optional<int32_t, parquet_field_int32>(9, s->field_id),
+                            parquet_field_struct(10, s->logical_type));
   return function_builder(this, op);
 }
 
@@ -181,21 +630,21 @@ bool CompactProtocolReader::read(LogicalType* l)
 
 bool CompactProtocolReader::read(DecimalType* d)
 {
-  auto op = std::make_tuple(ParquetFieldInt32(1, d->scale), ParquetFieldInt32(2, d->precision));
+  auto op = std::make_tuple(parquet_field_int32(1, d->scale), parquet_field_int32(2, d->precision));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(TimeType* t)
 {
   auto op =
-    std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit));
+    std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(TimestampType* t)
 {
   auto op =
-    std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit));
+    std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit));
   return function_builder(this, op);
 }
 
@@ -209,123 +658,129 @@ bool CompactProtocolReader::read(TimeUnit* u)
 
 bool CompactProtocolReader::read(IntType* i)
 {
-  auto op = std::make_tuple(ParquetFieldInt8(1, i->bitWidth), ParquetFieldBool(2, i->isSigned));
+  auto op = std::make_tuple(parquet_field_int8(1, i->bitWidth), parquet_field_bool(2, i->isSigned));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(RowGroup* r)
 {
-  auto op = std::make_tuple(ParquetFieldStructList(1, r->columns),
-                            ParquetFieldInt64(2, r->total_byte_size),
-                            ParquetFieldInt64(3, r->num_rows));
+  auto op = std::make_tuple(parquet_field_struct_list(1, r->columns),
+                            parquet_field_int64(2, r->total_byte_size),
+                            parquet_field_int64(3, r->num_rows));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(ColumnChunk* c)
 {
-  auto op = std::make_tuple(ParquetFieldString(1, c->file_path),
-                            ParquetFieldInt64(2, c->file_offset),
-                            ParquetFieldStruct(3, c->meta_data),
-                            ParquetFieldInt64(4, c->offset_index_offset),
-                            ParquetFieldInt32(5, c->offset_index_length),
-                            ParquetFieldInt64(6, c->column_index_offset),
-                            ParquetFieldInt32(7, c->column_index_length));
+  auto op = std::make_tuple(parquet_field_string(1, c->file_path),
+                            parquet_field_int64(2, c->file_offset),
+                            parquet_field_struct(3, c->meta_data),
+                            parquet_field_int64(4, c->offset_index_offset),
+                            parquet_field_int32(5, c->offset_index_length),
+                            parquet_field_int64(6, c->column_index_offset),
+                            parquet_field_int32(7, c->column_index_length));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(ColumnChunkMetaData* c)
 {
-  auto op = std::make_tuple(ParquetFieldEnum<Type>(1, c->type),
-                            ParquetFieldEnumList(2, c->encodings),
-                            ParquetFieldStringList(3, c->path_in_schema),
-                            ParquetFieldEnum<Compression>(4, c->codec),
-                            ParquetFieldInt64(5, c->num_values),
-                            ParquetFieldInt64(6, c->total_uncompressed_size),
-                            ParquetFieldInt64(7, c->total_compressed_size),
-                            ParquetFieldInt64(9, c->data_page_offset),
-                            ParquetFieldInt64(10, c->index_page_offset),
-                            ParquetFieldInt64(11, c->dictionary_page_offset),
-                            ParquetFieldStruct(12, c->statistics));
+  auto op = std::make_tuple(parquet_field_enum<Type>(1, c->type),
+                            parquet_field_enum_list(2, c->encodings),
+                            parquet_field_string_list(3, c->path_in_schema),
+                            parquet_field_enum<Compression>(4, c->codec),
+                            parquet_field_int64(5, c->num_values),
+                            parquet_field_int64(6, c->total_uncompressed_size),
+                            parquet_field_int64(7, c->total_compressed_size),
+                            parquet_field_int64(9, c->data_page_offset),
+                            parquet_field_int64(10, c->index_page_offset),
+                            parquet_field_int64(11, c->dictionary_page_offset),
+                            parquet_field_struct(12, c->statistics));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(PageHeader* p)
 {
-  auto op = std::make_tuple(ParquetFieldEnum<PageType>(1, p->type),
-                            ParquetFieldInt32(2, p->uncompressed_page_size),
-                            ParquetFieldInt32(3, p->compressed_page_size),
-                            ParquetFieldStruct(5, p->data_page_header),
-                            ParquetFieldStruct(7, p->dictionary_page_header),
-                            ParquetFieldStruct(8, p->data_page_header_v2));
+  auto op = std::make_tuple(parquet_field_enum<PageType>(1, p->type),
+                            parquet_field_int32(2, p->uncompressed_page_size),
+                            parquet_field_int32(3, p->compressed_page_size),
+                            parquet_field_struct(5, p->data_page_header),
+                            parquet_field_struct(7, p->dictionary_page_header),
+                            parquet_field_struct(8, p->data_page_header_v2));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(DataPageHeader* d)
 {
-  auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values),
-                            ParquetFieldEnum<Encoding>(2, d->encoding),
-                            ParquetFieldEnum<Encoding>(3, d->definition_level_encoding),
-                            ParquetFieldEnum<Encoding>(4, d->repetition_level_encoding));
+  auto op = std::make_tuple(parquet_field_int32(1, d->num_values),
+                            parquet_field_enum<Encoding>(2, d->encoding),
+                            parquet_field_enum<Encoding>(3, d->definition_level_encoding),
+                            parquet_field_enum<Encoding>(4, d->repetition_level_encoding));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(DictionaryPageHeader* d)
 {
-  auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values),
-                            ParquetFieldEnum<Encoding>(2, d->encoding));
+  auto op = std::make_tuple(parquet_field_int32(1, d->num_values),
+                            parquet_field_enum<Encoding>(2, d->encoding));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(DataPageHeaderV2* d)
 {
-  auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values),
-                            ParquetFieldInt32(2, d->num_nulls),
-                            ParquetFieldInt32(3, d->num_rows),
-                            ParquetFieldEnum<Encoding>(4, d->encoding),
-                            ParquetFieldInt32(5, d->definition_levels_byte_length),
-                            ParquetFieldInt32(6, d->repetition_levels_byte_length),
-                            ParquetFieldBool(7, d->is_compressed));
+  auto op = std::make_tuple(parquet_field_int32(1, d->num_values),
+                            parquet_field_int32(2, d->num_nulls),
+                            parquet_field_int32(3, d->num_rows),
+                            parquet_field_enum<Encoding>(4, d->encoding),
+                            parquet_field_int32(5, d->definition_levels_byte_length),
+                            parquet_field_int32(6, d->repetition_levels_byte_length),
+                            parquet_field_bool(7, d->is_compressed));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(KeyValue* k)
 {
-  auto op = std::make_tuple(ParquetFieldString(1, k->key), ParquetFieldString(2, k->value));
+  auto op = std::make_tuple(parquet_field_string(1, k->key), parquet_field_string(2, k->value));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(PageLocation* p)
 {
-  auto op = std::make_tuple(ParquetFieldInt64(1, p->offset),
-                            ParquetFieldInt32(2, p->compressed_page_size),
-                            ParquetFieldInt64(3, p->first_row_index));
+  auto op = std::make_tuple(parquet_field_int64(1, p->offset),
+                            parquet_field_int32(2, p->compressed_page_size),
+                            parquet_field_int64(3, p->first_row_index));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(OffsetIndex* o)
 {
-  auto op = std::make_tuple(ParquetFieldStructList(1, o->page_locations));
+  auto op = std::make_tuple(parquet_field_struct_list(1, o->page_locations));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(ColumnIndex* c)
 {
-  auto op = std::make_tuple(ParquetFieldBoolList(1, c->null_pages),
-                            ParquetFieldBinaryList(2, c->min_values),
-                            ParquetFieldBinaryList(3, c->max_values),
-                            ParquetFieldEnum<BoundaryOrder>(4, c->boundary_order),
-                            ParquetFieldInt64List(5, c->null_counts));
+  auto op = std::make_tuple(parquet_field_bool_list(1, c->null_pages),
+                            parquet_field_binary_list(2, c->min_values),
+                            parquet_field_binary_list(3, c->max_values),
+                            parquet_field_enum<BoundaryOrder>(4, c->boundary_order),
+                            parquet_field_int64_list(5, c->null_counts));
   return function_builder(this, op);
 }
 
 bool CompactProtocolReader::read(Statistics* s)
 {
-  auto op = std::make_tuple(ParquetFieldBinary(1, s->max),
-                            ParquetFieldBinary(2, s->min),
-                            ParquetFieldInt64(3, s->null_count),
-                            ParquetFieldInt64(4, s->distinct_count),
-                            ParquetFieldBinary(5, s->max_value),
-                            ParquetFieldBinary(6, s->min_value));
+  auto op = std::make_tuple(parquet_field_binary(1, s->max),
+                            parquet_field_binary(2, s->min),
+                            parquet_field_int64(3, s->null_count),
+                            parquet_field_int64(4, s->distinct_count),
+                            parquet_field_binary(5, s->max_value),
+                            parquet_field_binary(6, s->min_value));
+  return function_builder(this, op);
+}
+
+bool CompactProtocolReader::read(ColumnOrder* c)
+{
+  auto op = std::make_tuple(parquet_field_union_enumerator<ColumnOrder::Type>(1, c->type));
   return function_builder(this, op);
 }
 
@@ -338,7 +793,7 @@ bool CompactProtocolReader::read(Statistics* s)
  */
 bool CompactProtocolReader::InitSchema(FileMetaData* md)
 {
-  if (static_cast<std::size_t>(WalkSchema(md)) != md->schema.size()) return false;
+  if (static_cast<std::size_t>(WalkSchema(md)) != md->schema.size()) { return false; }
 
   /* Inside FileMetaData, there is a std::vector of RowGroups and each RowGroup contains a
    * a std::vector of ColumnChunks. Each ColumnChunk has a member ColumnMetaData, which contains
@@ -353,13 +808,15 @@ bool CompactProtocolReader::InitSchema(FileMetaData* md)
       for (auto const& path : column.meta_data.path_in_schema) {
         auto const it = [&] {
           // find_if starting at (current_schema_index + 1) and then wrapping
-          auto schema = [&](auto const& e) { return e.parent_idx == parent && e.name == path; };
-          auto mid    = md->schema.cbegin() + current_schema_index + 1;
-          auto it     = std::find_if(mid, md->schema.cend(), schema);
-          if (it != md->schema.cend()) return it;
+          auto const schema = [&](auto const& e) {
+            return e.parent_idx == parent && e.name == path;
+          };
+          auto const mid = md->schema.cbegin() + current_schema_index + 1;
+          auto const it  = std::find_if(mid, md->schema.cend(), schema);
+          if (it != md->schema.cend()) { return it; }
           return std::find_if(md->schema.cbegin(), mid, schema);
         }();
-        if (it == md->schema.cend()) return false;
+        if (it == md->schema.cend()) { return false; }
         current_schema_index = std::distance(md->schema.cbegin(), it);
         column.schema_idx    = current_schema_index;
         parent               = current_schema_index;
@@ -401,9 +858,9 @@ int CompactProtocolReader::WalkSchema(
     if (e->num_children > 0) {
       for (int i = 0; i < e->num_children; i++) {
         e->children_idx.push_back(idx);
-        int idx_old = idx;
-        idx         = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level);
-        if (idx <= idx_old) break;  // Error
+        int const idx_old = idx;
+        idx               = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level);
+        if (idx <= idx_old) { break; }  // Error
       }
     }
     return idx;
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index 62ccacaac37..619815db503 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -22,6 +22,7 @@
 #include <cstddef>
 #include <optional>
 #include <string>
+#include <utility>
 #include <vector>
 
 namespace cudf {
@@ -40,9 +41,6 @@ namespace parquet {
  * compression codecs are supported yet.
  */
 class CompactProtocolReader {
- protected:
-  static const uint8_t g_list2struct[16];
-
  public:
   explicit CompactProtocolReader(uint8_t const* base = nullptr, size_t len = 0) { init(base, len); }
   void init(uint8_t const* base, size_t len)
@@ -57,45 +55,46 @@ class CompactProtocolReader {
     bytecnt = std::min(bytecnt, (size_t)(m_end - m_cur));
     m_cur += bytecnt;
   }
-  uint32_t get_u32() noexcept
+
+  // returns a varint encoded integer
+  template <typename T>
+  T get_varint() noexcept
   {
-    uint32_t v = 0;
+    T v = 0;
     for (uint32_t l = 0;; l += 7) {
-      uint32_t c = getb();
+      T c = getb();
       v |= (c & 0x7f) << l;
-      if (c < 0x80) break;
+      if (c < 0x80) { break; }
     }
     return v;
   }
-  uint64_t get_u64() noexcept
-  {
-    uint64_t v = 0;
-    for (uint64_t l = 0;; l += 7) {
-      uint64_t c = getb();
-      v |= (c & 0x7f) << l;
-      if (c < 0x80) break;
-    }
-    return v;
-  }
-  int32_t get_i16() noexcept { return get_i32(); }
-  int32_t get_i32() noexcept
-  {
-    uint32_t u = get_u32();
-    return (int32_t)((u >> 1u) ^ -(int32_t)(u & 1));
-  }
-  int64_t get_i64() noexcept
+
+  // returns a zigzag encoded signed integer
+  template <typename T>
+  T get_zigzag() noexcept
   {
-    uint64_t u = get_u64();
-    return (int64_t)((u >> 1u) ^ -(int64_t)(u & 1));
+    using U   = std::make_unsigned_t<T>;
+    U const u = get_varint<U>();
+    return static_cast<T>((u >> 1u) ^ -static_cast<T>(u & 1));
   }
-  int32_t get_listh(uint8_t* el_type) noexcept
+
+  // thrift spec says to use zigzag i32 for i16 types
+  int32_t get_i16() noexcept { return get_zigzag<int32_t>(); }
+  int32_t get_i32() noexcept { return get_zigzag<int32_t>(); }
+  int64_t get_i64() noexcept { return get_zigzag<int64_t>(); }
+
+  uint32_t get_u32() noexcept { return get_varint<uint32_t>(); }
+  uint64_t get_u64() noexcept { return get_varint<uint64_t>(); }
+
+  [[nodiscard]] std::pair<uint8_t, uint32_t> get_listh() noexcept
   {
-    uint32_t c = getb();
-    int32_t sz = c >> 4;
-    *el_type   = c & 0xf;
-    if (sz == 0xf) sz = get_u32();
-    return sz;
+    uint32_t const c = getb();
+    uint32_t sz      = c >> 4;
+    uint8_t t        = c & 0xf;
+    if (sz == 0xf) { sz = get_u32(); }
+    return {t, sz};
   }
+
   bool skip_struct_field(int t, int depth = 0);
 
  public:
@@ -120,6 +119,7 @@ class CompactProtocolReader {
   bool read(OffsetIndex* o);
   bool read(ColumnIndex* c);
   bool read(Statistics* s);
+  bool read(ColumnOrder* c);
 
  public:
   static int NumRequiredBits(uint32_t max_level) noexcept
@@ -140,523 +140,11 @@ class CompactProtocolReader {
   uint8_t const* m_cur  = nullptr;
   uint8_t const* m_end  = nullptr;
 
-  friend class ParquetFieldBool;
-  friend class ParquetFieldBoolList;
-  friend class ParquetFieldInt8;
-  friend class ParquetFieldInt32;
-  friend class ParquetFieldOptionalInt32;
-  friend class ParquetFieldInt64;
-  friend class ParquetFieldInt64List;
-  template <typename T>
-  friend class ParquetFieldStructListFunctor;
-  friend class ParquetFieldString;
-  template <typename T>
-  friend class ParquetFieldStructFunctor;
-  template <typename T, bool>
-  friend class ParquetFieldUnionFunctor;
-  template <typename T>
-  friend class ParquetFieldEnum;
-  template <typename T>
-  friend class ParquetFieldEnumListFunctor;
-  friend class ParquetFieldStringList;
-  friend class ParquetFieldBinary;
-  friend class ParquetFieldBinaryList;
-  friend class ParquetFieldStructBlob;
-};
-
-/**
- * @brief Functor to set value to bool read from CompactProtocolReader
- *
- * @return True if field type is not bool
- */
-class ParquetFieldBool {
-  int field_val;
-  bool& val;
-
- public:
-  ParquetFieldBool(int f, bool& v) : field_val(f), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    return (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) ||
-           !(val = (field_type == ST_FLD_TRUE), true);
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to read a vector of booleans from CompactProtocolReader
- *
- * @return True if field types mismatch or if the process of reading a
- * bool fails
- */
-class ParquetFieldBoolList {
-  int field_val;
-  std::vector<bool>& val;
-
- public:
-  ParquetFieldBoolList(int f, std::vector<bool>& v) : field_val(f), val(v) {}
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_LIST) return true;
-    uint8_t t;
-    int32_t n = cpr->get_listh(&t);
-    if (t != ST_FLD_TRUE) return true;
-    val.resize(n);
-    for (int32_t i = 0; i < n; i++) {
-      unsigned int current_byte = cpr->getb();
-      if (current_byte != ST_FLD_TRUE && current_byte != ST_FLD_FALSE) return true;
-      val[i] = current_byte == ST_FLD_TRUE;
-    }
-    return false;
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to set value to 8 bit integer read from CompactProtocolReader
- *
- * @return True if field type is not int8
- */
-class ParquetFieldInt8 {
-  int field_val;
-  int8_t& val;
-
- public:
-  ParquetFieldInt8(int f, int8_t& v) : field_val(f), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    val = cpr->getb();
-    return (field_type != ST_FLD_BYTE);
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to set value to 32 bit integer read from CompactProtocolReader
- *
- * @return True if field type is not int32
- */
-class ParquetFieldInt32 {
-  int field_val;
-  int32_t& val;
-
- public:
-  ParquetFieldInt32(int f, int32_t& v) : field_val(f), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    val = cpr->get_i32();
-    return (field_type != ST_FLD_I32);
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to set value to optional 32 bit integer read from CompactProtocolReader
- *
- * @return True if field type is not int32
- */
-class ParquetFieldOptionalInt32 {
-  int field_val;
-  std::optional<int32_t>& val;
-
- public:
-  ParquetFieldOptionalInt32(int f, std::optional<int32_t>& v) : field_val(f), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    val = cpr->get_i32();
-    return (field_type != ST_FLD_I32);
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to set value to 64 bit integer read from CompactProtocolReader
- *
- * @return True if field type is not int32 or int64
- */
-class ParquetFieldInt64 {
-  int field_val;
-  int64_t& val;
-
- public:
-  ParquetFieldInt64(int f, int64_t& v) : field_val(f), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    val = cpr->get_i64();
-    return (field_type < ST_FLD_I16 || field_type > ST_FLD_I64);
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to read a vector of 64-bit integers from CompactProtocolReader
- *
- * @return True if field types mismatch or if the process of reading an
- * int64 fails
- */
-class ParquetFieldInt64List {
-  int field_val;
-  std::vector<int64_t>& val;
-
- public:
-  ParquetFieldInt64List(int f, std::vector<int64_t>& v) : field_val(f), val(v) {}
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_LIST) return true;
-    uint8_t t;
-    int32_t n = cpr->get_listh(&t);
-    if (t != ST_FLD_I64) return true;
-    val.resize(n);
-    for (int32_t i = 0; i < n; i++) {
-      val[i] = cpr->get_i64();
-    }
-    return false;
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to read a vector of structures from CompactProtocolReader
- *
- * @return True if field types mismatch or if the process of reading a
- * struct fails
- */
-template <typename T>
-class ParquetFieldStructListFunctor {
-  int field_val;
-  std::vector<T>& val;
-
- public:
-  ParquetFieldStructListFunctor(int f, std::vector<T>& v) : field_val(f), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_LIST) return true;
-
-    int current_byte = cpr->getb();
-    if ((current_byte & 0xf) != ST_FLD_STRUCT) return true;
-    int n = current_byte >> 4;
-    if (n == 0xf) n = cpr->get_u32();
-    val.resize(n);
-    for (int32_t i = 0; i < n; i++) {
-      if (!(cpr->read(&val[i]))) { return true; }
-    }
-
-    return false;
-  }
-
-  int field() { return field_val; }
-};
-
-template <typename T>
-ParquetFieldStructListFunctor<T> ParquetFieldStructList(int f, std::vector<T>& v)
-{
-  return ParquetFieldStructListFunctor<T>(f, v);
-}
-
-/**
- * @brief Functor to read a string from CompactProtocolReader
- *
- * @return True if field type mismatches or if size of string exceeds bounds
- * of the CompactProtocolReader
- */
-class ParquetFieldString {
-  int field_val;
-  std::string& val;
-
- public:
-  ParquetFieldString(int f, std::string& v) : field_val(f), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_BINARY) return true;
-    uint32_t n = cpr->get_u32();
-    if (n < (size_t)(cpr->m_end - cpr->m_cur)) {
-      val.assign((char const*)cpr->m_cur, n);
-      cpr->m_cur += n;
-      return false;
-    } else {
-      return true;
-    }
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to read a structure from CompactProtocolReader
- *
- * @return True if field types mismatch or if the process of reading a
- * struct fails
- */
-template <typename T>
-class ParquetFieldStructFunctor {
-  int field_val;
-  T& val;
-
- public:
-  ParquetFieldStructFunctor(int f, T& v) : field_val(f), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    return (field_type != ST_FLD_STRUCT || !(cpr->read(&val)));
-  }
-
-  int field() { return field_val; }
-};
-
-template <typename T>
-ParquetFieldStructFunctor<T> ParquetFieldStruct(int f, T& v)
-{
-  return ParquetFieldStructFunctor<T>(f, v);
-}
-
-/**
- * @brief Functor to read a union member from CompactProtocolReader
- *
- * @tparam is_empty True if tparam `T` type is empty type, else false.
- *
- * @return True if field types mismatch or if the process of reading a
- * union member fails
- */
-template <typename T, bool is_empty = false>
-class ParquetFieldUnionFunctor {
-  int field_val;
-  bool& is_set;
-  T& val;
-
- public:
-  ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_STRUCT) {
-      return true;
-    } else {
-      is_set = true;
-      return !cpr->read(&val);
-    }
-  }
-
-  int field() { return field_val; }
-};
-
-template <typename T>
-struct ParquetFieldUnionFunctor<T, true> {
-  int field_val;
-  bool& is_set;
-  T& val;
-
- public:
-  ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_STRUCT) {
-      return true;
-    } else {
-      is_set = true;
-      cpr->skip_struct_field(field_type);
-      return false;
-    }
-  }
-
-  int field() { return field_val; }
-};
-
-template <typename T>
-ParquetFieldUnionFunctor<T, std::is_empty_v<T>> ParquetFieldUnion(int f, bool& b, T& v)
-{
-  return ParquetFieldUnionFunctor<T, std::is_empty_v<T>>(f, b, v);
-}
-
-/**
- * @brief Functor to set value to enum read from CompactProtocolReader
- *
- * @return True if field type is not int32
- */
-template <typename Enum>
-class ParquetFieldEnum {
-  int field_val;
-  Enum& val;
-
- public:
-  ParquetFieldEnum(int f, Enum& v) : field_val(f), val(v) {}
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    val = static_cast<Enum>(cpr->get_i32());
-    return (field_type != ST_FLD_I32);
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to read a vector of enums from CompactProtocolReader
- *
- * @return True if field types mismatch or if the process of reading an
- * enum fails
- */
-template <typename Enum>
-class ParquetFieldEnumListFunctor {
-  int field_val;
-  std::vector<Enum>& val;
-
- public:
-  ParquetFieldEnumListFunctor(int f, std::vector<Enum>& v) : field_val(f), val(v) {}
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_LIST) return true;
-    int current_byte = cpr->getb();
-    if ((current_byte & 0xf) != ST_FLD_I32) return true;
-    int n = current_byte >> 4;
-    if (n == 0xf) n = cpr->get_u32();
-    val.resize(n);
-    for (int32_t i = 0; i < n; i++) {
-      val[i] = static_cast<Enum>(cpr->get_i32());
-    }
-    return false;
-  }
-
-  int field() { return field_val; }
-};
-
-template <typename T>
-ParquetFieldEnumListFunctor<T> ParquetFieldEnumList(int field, std::vector<T>& v)
-{
-  return ParquetFieldEnumListFunctor<T>(field, v);
-}
-
-/**
- * @brief Functor to read a vector of strings from CompactProtocolReader
- *
- * @return True if field types mismatch or if the process of reading a
- * string fails
- */
-class ParquetFieldStringList {
-  int field_val;
-  std::vector<std::string>& val;
-
- public:
-  ParquetFieldStringList(int f, std::vector<std::string>& v) : field_val(f), val(v) {}
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_LIST) return true;
-    uint8_t t;
-    int32_t n = cpr->get_listh(&t);
-    if (t != ST_FLD_BINARY) return true;
-    val.resize(n);
-    for (int32_t i = 0; i < n; i++) {
-      uint32_t l = cpr->get_u32();
-      if (l < (size_t)(cpr->m_end - cpr->m_cur)) {
-        val[i].assign((char const*)cpr->m_cur, l);
-        cpr->m_cur += l;
-      } else
-        return true;
-    }
-    return false;
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to read a binary from CompactProtocolReader
- *
- * @return True if field type mismatches or if size of binary exceeds bounds
- * of the CompactProtocolReader
- */
-class ParquetFieldBinary {
-  int field_val;
-  std::vector<uint8_t>& val;
-
- public:
-  ParquetFieldBinary(int f, std::vector<uint8_t>& v) : field_val(f), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_BINARY) return true;
-    uint32_t n = cpr->get_u32();
-    if (n <= (size_t)(cpr->m_end - cpr->m_cur)) {
-      val.resize(n);
-      val.assign(cpr->m_cur, cpr->m_cur + n);
-      cpr->m_cur += n;
-      return false;
-    } else {
-      return true;
-    }
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to read a vector of binaries from CompactProtocolReader
- *
- * @return True if field types mismatch or if the process of reading a
- * binary fails
- */
-class ParquetFieldBinaryList {
-  int field_val;
-  std::vector<std::vector<uint8_t>>& val;
-
- public:
-  ParquetFieldBinaryList(int f, std::vector<std::vector<uint8_t>>& v) : field_val(f), val(v) {}
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_LIST) return true;
-    uint8_t t;
-    int32_t n = cpr->get_listh(&t);
-    if (t != ST_FLD_BINARY) return true;
-    val.resize(n);
-    for (int32_t i = 0; i < n; i++) {
-      uint32_t l = cpr->get_u32();
-      if (l <= (size_t)(cpr->m_end - cpr->m_cur)) {
-        val[i].resize(l);
-        val[i].assign(cpr->m_cur, cpr->m_cur + l);
-        cpr->m_cur += l;
-      } else
-        return true;
-    }
-    return false;
-  }
-
-  int field() { return field_val; }
-};
-
-/**
- * @brief Functor to read a struct from CompactProtocolReader
- *
- * @return True if field type mismatches
- */
-class ParquetFieldStructBlob {
-  int field_val;
-  std::vector<uint8_t>& val;
-
- public:
-  ParquetFieldStructBlob(int f, std::vector<uint8_t>& v) : field_val(f), val(v) {}
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_STRUCT) return true;
-    uint8_t const* start = cpr->m_cur;
-    cpr->skip_struct_field(field_type);
-    if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); }
-    return false;
-  }
-
-  int field() { return field_val; }
+  friend class parquet_field_string;
+  friend class parquet_field_string_list;
+  friend class parquet_field_binary;
+  friend class parquet_field_binary_list;
+  friend class parquet_field_struct_blob;
 };
 
 }  // namespace parquet
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index b2c0c97c52d..60bc8984d81 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -33,18 +33,7 @@ size_t CompactProtocolWriter::write(FileMetaData const& f)
   c.field_struct_list(4, f.row_groups);
   if (not f.key_value_metadata.empty()) { c.field_struct_list(5, f.key_value_metadata); }
   if (not f.created_by.empty()) { c.field_string(6, f.created_by); }
-  if (f.column_order_listsize != 0) {
-    // Dummy list of struct containing an empty field1 struct
-    c.put_field_header(7, c.current_field(), ST_FLD_LIST);
-    c.put_byte((uint8_t)((std::min(f.column_order_listsize, 0xfu) << 4) | ST_FLD_STRUCT));
-    if (f.column_order_listsize >= 0xf) c.put_uint(f.column_order_listsize);
-    for (uint32_t i = 0; i < f.column_order_listsize; i++) {
-      c.put_field_header(1, 0, ST_FLD_STRUCT);
-      c.put_byte(0);  // ColumnOrder.field1 struct end
-      c.put_byte(0);  // ColumnOrder struct end
-    }
-    c.set_current_field(7);
-  }
+  if (f.column_orders.has_value()) { c.field_struct_list(7, f.column_orders.value()); }
   return c.value();
 }
 
@@ -233,6 +222,16 @@ size_t CompactProtocolWriter::write(OffsetIndex const& s)
   return c.value();
 }
 
+size_t CompactProtocolWriter::write(ColumnOrder const& co)
+{
+  CompactProtocolFieldWriter c(*this);
+  switch (co) {
+    case ColumnOrder::TYPE_ORDER: c.field_empty_struct(1); break;
+    default: break;
+  }
+  return c.value();
+}
+
 void CompactProtocolFieldWriter::put_byte(uint8_t v) { writer.m_buf.push_back(v); }
 
 void CompactProtocolFieldWriter::put_byte(uint8_t const* raw, uint32_t len)
@@ -320,6 +319,13 @@ inline void CompactProtocolFieldWriter::field_struct(int field, T const& val)
   current_field_value = field;
 }
 
+inline void CompactProtocolFieldWriter::field_empty_struct(int field)
+{
+  put_field_header(field, current_field_value, ST_FLD_STRUCT);
+  put_byte(0);  // add a stop field
+  current_field_value = field;
+}
+
 template <typename T>
 inline void CompactProtocolFieldWriter::field_struct_list(int field, std::vector<T> const& val)
 {
diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp
index 8d7b0961934..26d66527aa5 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.hpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.hpp
@@ -53,6 +53,7 @@ class CompactProtocolWriter {
   size_t write(Statistics const&);
   size_t write(PageLocation const&);
   size_t write(OffsetIndex const&);
+  size_t write(ColumnOrder const&);
 
  protected:
   std::vector<uint8_t>& m_buf;
@@ -94,6 +95,8 @@ class CompactProtocolFieldWriter {
   template <typename T>
   inline void field_struct(int field, T const& val);
 
+  inline void field_empty_struct(int field);
+
   template <typename T>
   inline void field_struct_list(int field, std::vector<T> const& val);
 
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index f7318bb9935..c2affc774c2 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -18,6 +18,8 @@
 
 #include "parquet_common.hpp"
 
+#include <thrust/optional.h>
+
 #include <cstdint>
 #include <optional>
 #include <string>
@@ -118,6 +120,16 @@ struct LogicalType {
   BsonType BSON;
 };
 
+/**
+ * @brief Union to specify the order used for the min_value and max_value fields for a column.
+ */
+struct ColumnOrder {
+  enum Type { UNDEFINED, TYPE_ORDER };
+  Type type;
+
+  operator Type() const { return type; }
+};
+
 /**
  * @brief Struct for describing an element/field in the Parquet format schema
  *
@@ -135,7 +147,7 @@ struct SchemaElement {
   int32_t num_children                = 0;
   int32_t decimal_scale               = 0;
   int32_t decimal_precision           = 0;
-  std::optional<int32_t> field_id     = std::nullopt;
+  thrust::optional<int32_t> field_id  = thrust::nullopt;
   bool output_as_byte_array           = false;
 
   // The following fields are filled in later during schema initialization
@@ -284,8 +296,8 @@ struct FileMetaData {
   int64_t num_rows = 0;
   std::vector<RowGroup> row_groups;
   std::vector<KeyValue> key_value_metadata;
-  std::string created_by         = "";
-  uint32_t column_order_listsize = 0;
+  std::string created_by = "";
+  thrust::optional<std::vector<ColumnOrder>> column_orders;
 };
 
 /**
diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp
index 5f8f1617cb9..5a1716bb547 100644
--- a/cpp/src/io/parquet/parquet_common.hpp
+++ b/cpp/src/io/parquet/parquet_common.hpp
@@ -141,7 +141,7 @@ enum BoundaryOrder {
 /**
  * @brief Thrift compact protocol struct field types
  */
-enum {
+enum FieldType {
   ST_FLD_TRUE   = 1,
   ST_FLD_FALSE  = 2,
   ST_FLD_BYTE   = 3,
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index d2976a3f5d9..a124f352ee4 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -74,8 +74,11 @@ struct aggregate_writer_metadata {
     for (size_t i = 0; i < partitions.size(); ++i) {
       this->files[i].num_rows = partitions[i].num_rows;
     }
-    this->column_order_listsize =
-      (stats_granularity != statistics_freq::STATISTICS_NONE) ? num_columns : 0;
+
+    if (stats_granularity != statistics_freq::STATISTICS_NONE) {
+      ColumnOrder default_order = {ColumnOrder::TYPE_ORDER};
+      this->column_orders       = std::vector<ColumnOrder>(num_columns, default_order);
+    }
 
     for (size_t p = 0; p < kv_md.size(); ++p) {
       std::transform(kv_md[p].begin(),
@@ -102,13 +105,13 @@ struct aggregate_writer_metadata {
   {
     CUDF_EXPECTS(part < files.size(), "Invalid part index queried");
     FileMetaData meta{};
-    meta.version               = this->version;
-    meta.schema                = this->schema;
-    meta.num_rows              = this->files[part].num_rows;
-    meta.row_groups            = this->files[part].row_groups;
-    meta.key_value_metadata    = this->files[part].key_value_metadata;
-    meta.created_by            = this->created_by;
-    meta.column_order_listsize = this->column_order_listsize;
+    meta.version            = this->version;
+    meta.schema             = this->schema;
+    meta.num_rows           = this->files[part].num_rows;
+    meta.row_groups         = this->files[part].row_groups;
+    meta.key_value_metadata = this->files[part].key_value_metadata;
+    meta.created_by         = this->created_by;
+    meta.column_orders      = this->column_orders;
     return meta;
   }
 
@@ -170,8 +173,8 @@ struct aggregate_writer_metadata {
     std::vector<std::vector<uint8_t>> column_indexes;
   };
   std::vector<per_file_metadata> files;
-  std::string created_by         = "";
-  uint32_t column_order_listsize = 0;
+  std::string created_by                                   = "";
+  thrust::optional<std::vector<ColumnOrder>> column_orders = thrust::nullopt;
 };
 
 namespace {
@@ -2373,20 +2376,7 @@ std::unique_ptr<std::vector<uint8_t>> writer::merge_row_group_metadata(
       md.num_rows += tmp.num_rows;
     }
   }
-  // Reader doesn't currently populate column_order, so infer it here
-  if (not md.row_groups.empty()) {
-    auto const is_valid_stats = [](auto const& stats) {
-      return not stats.max.empty() || not stats.min.empty() || stats.null_count != -1 ||
-             stats.distinct_count != -1 || not stats.max_value.empty() ||
-             not stats.min_value.empty();
-    };
 
-    uint32_t num_columns = static_cast<uint32_t>(md.row_groups[0].columns.size());
-    md.column_order_listsize =
-      (num_columns > 0 && is_valid_stats(md.row_groups[0].columns[0].meta_data.statistics))
-        ? num_columns
-        : 0;
-  }
   // Thrift-encode the resulting output
   file_header_s fhdr;
   file_ender_s fendr;

From e87d2fc1df6105d802b300bad19a9937f8155613 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 20 Sep 2023 21:18:31 +0100
Subject: [PATCH 21/23] Reduce memory usage of as_categorical_column (#14138)

The main culprit is in the way the codes returned from _label_encoding were being ordered. We were generating an int64 column for the order, gathering through the left gather map, and then argsorting, before using that ordering as a gather map for the codes.

We note that gather(y, with=argsort(x)) is equivalent to sort_by_key(y, with=x), so we use that instead (avoiding an unnecessary gather). Furthermore, gather([0..n), with=x) is simply equivalent to x, so we can avoid that gather too.

This reduces the peak memory footprint of categorifying a random column of 500_000_000 int32 values where there are 100 unique values from 24.75 GiB to 11.67 GiB.

### Test code

```python
import cudf
import cupy as cp

K = 100
N = 500_000_000
rng = cp.random._generator.RandomState()
column = cudf.core.column.as_column(rng.choice(cp.arange(K, dtype="int32"), size=(N,), replace=True))
column = column.astype("category", ordered=False)
```

### Before

![Screenshot from 2023-09-20 14-49-27](https://github.com/rapidsai/cudf/assets/1126981/08782501-c233-4efd-b4d6-a378cea82a82)

### After

![Screenshot from 2023-09-20 14-49-42](https://github.com/rapidsai/cudf/assets/1126981/93193bfb-a93e-45bf-8e5a-24289efc77c4)

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/14138
---
 python/cudf/cudf/core/column/column.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index d2e2f11a12e..0bc50a521e2 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1390,20 +1390,19 @@ def _return_sentinel_column():
         except ValueError:
             return _return_sentinel_column()
 
-        codes = arange(len(cats), dtype=dtype)
         left_gather_map, right_gather_map = cpp_join(
             [self], [cats], how="left"
         )
-        codes = codes.take(
-            right_gather_map, nullify=True, check_bounds=False
-        ).fillna(na_sentinel.value)
-
+        codes = libcudf.copying.gather(
+            [arange(len(cats), dtype=dtype)], right_gather_map, nullify=True
+        )
+        del right_gather_map
         # reorder `codes` so that its values correspond to the
         # values of `self`:
-        order = arange(len(self))
-        order = order.take(left_gather_map, check_bounds=False).argsort()
-        codes = codes.take(order)
-        return codes
+        (codes,) = libcudf.sort.sort_by_key(
+            codes, [left_gather_map], [True], ["last"], stable=True
+        )
+        return codes.fillna(na_sentinel.value)
 
 
 def column_empty_like(

From fe99e4baa3a7cd0f87658bf1ea77b17ec61fd7dc Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 21 Sep 2023 10:42:32 -0400
Subject: [PATCH 22/23] Expose stream parameter in public strings find APIs
 (#14060)

Add stream parameter to public APIs:

- `cudf::strings::find()`
- `cudf::strings::rfind()`
- `cudf::strings::contains()`
- `cudf::strings::starts_with()`
- `cudf::strings::ends_with()`
- `cudf::strings::findall()`
- `cudf::strings::find_multiple()`

Also cleaned up some of the doxygen comments.

Reference #13744

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/14060
---
 cpp/include/cudf/strings/find.hpp          | 102 ++++++++++++---------
 cpp/include/cudf/strings/find_multiple.hpp |  12 ++-
 cpp/include/cudf/strings/findall.hpp       |   2 +
 cpp/src/strings/search/find.cu             |  24 +++--
 cpp/src/strings/search/find_multiple.cu    |   7 +-
 cpp/src/strings/search/findall.cu          |   3 +-
 cpp/tests/CMakeLists.txt                   |   5 +-
 cpp/tests/streams/strings/find_test.cpp    |  49 ++++++++++
 8 files changed, 143 insertions(+), 61 deletions(-)
 create mode 100644 cpp/tests/streams/strings/find_test.cpp

diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp
index 2fed36862b9..c1aa8b294b3 100644
--- a/cpp/include/cudf/strings/find.hpp
+++ b/cpp/include/cudf/strings/find.hpp
@@ -43,19 +43,21 @@ namespace strings {
  *
  * @throw cudf::logic_error if start position is greater than stop position.
  *
- * @param strings Strings instance for this operation.
- * @param target UTF-8 encoded string to search for in each string.
- * @param start First character position to include in the search.
+ * @param input Strings instance for this operation
+ * @param target UTF-8 encoded string to search for in each string
+ * @param start First character position to include in the search
  * @param stop Last position (exclusive) to include in the search.
  *             Default of -1 will search to the end of the string.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New integer column with character position values.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New integer column with character position values
  */
 std::unique_ptr<column> find(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   string_scalar const& target,
   size_type start                     = 0,
   size_type stop                      = -1,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -72,19 +74,21 @@ std::unique_ptr<column> find(
  *
  * @throw cudf::logic_error if start position is greater than stop position.
  *
- * @param strings Strings instance for this operation.
- * @param target UTF-8 encoded string to search for in each string.
- * @param start First position to include in the search.
+ * @param input Strings instance for this operation
+ * @param target UTF-8 encoded string to search for in each string
+ * @param start First position to include in the search
  * @param stop Last position (exclusive) to include in the search.
  *             Default of -1 will search starting at the end of the string.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New integer column with character position values.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New integer column with character position values
  */
 std::unique_ptr<column> rfind(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   string_scalar const& target,
   size_type start                     = 0,
   size_type stop                      = -1,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -123,37 +127,41 @@ std::unique_ptr<column> find(
  *
  * Any null string entries return corresponding null entries in the output columns.
  *
- * @param strings Strings instance for this operation.
- * @param target UTF-8 encoded string to search for in each string.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New type_id::BOOL8 column.
+ * @param input Strings instance for this operation
+ * @param target UTF-8 encoded string to search for in each string
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New BOOL8 column
  */
 std::unique_ptr<column> contains(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   string_scalar const& target,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
  * the corresponding target string was found within that string in the provided column.
  *
+ * The `output[i] = true` if string `targets[i]` is found inside `input[i]` otherwise
+ * The 'output[i] = true` if string `targets[i]` is found inside `input[i]` otherwise
  * `output[i] = false`.
  * If `target[i]` is an empty string, true is returned for `output[i]`.
  * If `target[i]` is null, false is returned for `output[i]`.
  *
- * Any null `strings[i]` row results in a null `output[i]` row.
+ * Any null string entries return corresponding null entries in the output columns.
  *
  * @throw cudf::logic_error if `strings.size() != targets.size()`.
  *
- * @param strings Strings instance for this operation.
- * @param targets Strings column of targets to check row-wise in `strings`.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New type_id::BOOL8 column.
+ * @param input Strings instance for this operation
+ * @param targets Strings column of targets to check row-wise in `input`
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New BOOL8 column
  */
 std::unique_ptr<column> contains(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   strings_column_view const& targets,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -166,14 +174,16 @@ std::unique_ptr<column> contains(
  *
  * Any null string entries return corresponding null entries in the output columns.
  *
- * @param strings Strings instance for this operation.
- * @param target UTF-8 encoded string to search for in each string.
- * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @param input Strings instance for this operation
+ * @param target UTF-8 encoded string to search for in each string
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New type_id::BOOL8 column.
  */
 std::unique_ptr<column> starts_with(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   string_scalar const& target,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -190,14 +200,16 @@ std::unique_ptr<column> starts_with(
  *
  * @throw cudf::logic_error if `strings.size() != targets.size()`.
  *
- * @param strings Strings instance for this operation.
- * @param targets Strings instance for this operation.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New type_id::BOOL8 column.
+ * @param input Strings instance for this operation
+ * @param targets Strings column of targets to check row-wise against `input`
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New BOOL8 column
  */
 std::unique_ptr<column> starts_with(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   strings_column_view const& targets,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -210,14 +222,16 @@ std::unique_ptr<column> starts_with(
  *
  * Any null string entries return corresponding null entries in the output columns.
  *
- * @param strings Strings instance for this operation.
- * @param target UTF-8 encoded string to search for in each string.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New type_id::BOOL8 column.
+ * @param input Strings instance for this operation
+ * @param target UTF-8 encoded string to search for in each string
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New BOOL8 column
  */
 std::unique_ptr<column> ends_with(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   string_scalar const& target,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -234,14 +248,16 @@ std::unique_ptr<column> ends_with(
  *
  * @throw cudf::logic_error if `strings.size() != targets.size()`.
  *
- * @param strings Strings instance for this operation.
- * @param targets Strings instance for this operation.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New type_id::BOOL8 column.
+ * @param input Strings instance for this operation
+ * @param targets Strings column of targets to check row-wise against `input`
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New BOOL8 column
  */
 std::unique_ptr<column> ends_with(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   strings_column_view const& targets,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 21cfdb15146..06b851c5012 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -48,14 +48,16 @@ namespace strings {
  *
  * @throw cudf::logic_error if `targets` is empty or contains nulls
  *
- * @param input Strings instance for this operation.
- * @param targets Strings to search for in each string.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return Lists column with character position values.
+ * @param input Strings instance for this operation
+ * @param targets Strings to search for in each string
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Lists column with character position values
  */
 std::unique_ptr<column> find_multiple(
   strings_column_view const& input,
   strings_column_view const& targets,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp
index 745f0fc19ff..379b9624dc6 100644
--- a/cpp/include/cudf/strings/findall.hpp
+++ b/cpp/include/cudf/strings/findall.hpp
@@ -57,12 +57,14 @@ struct regex_program;
  *
  * @param input Strings instance for this operation
  * @param prog Regex program instance
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New lists column of strings
  */
 std::unique_ptr<column> findall(
   strings_column_view const& input,
   regex_program const& prog,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
index 3de9dd34d83..1299e552565 100644
--- a/cpp/src/strings/search/find.cu
+++ b/cpp/src/strings/search/find.cu
@@ -305,20 +305,22 @@ std::unique_ptr<column> find(strings_column_view const& strings,
                              string_scalar const& target,
                              size_type start,
                              size_type stop,
+                             rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::find(strings, target, start, stop, cudf::get_default_stream(), mr);
+  return detail::find(strings, target, start, stop, stream, mr);
 }
 
 std::unique_ptr<column> rfind(strings_column_view const& strings,
                               string_scalar const& target,
                               size_type start,
                               size_type stop,
+                              rmm::cuda_stream_view stream,
                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::rfind(strings, target, start, stop, cudf::get_default_stream(), mr);
+  return detail::rfind(strings, target, start, stop, stream, mr);
 }
 
 std::unique_ptr<column> find(strings_column_view const& input,
@@ -618,50 +620,56 @@ std::unique_ptr<column> ends_with(strings_column_view const& strings,
 
 std::unique_ptr<column> contains(strings_column_view const& strings,
                                  string_scalar const& target,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::contains(strings, target, cudf::get_default_stream(), mr);
+  return detail::contains(strings, target, stream, mr);
 }
 
 std::unique_ptr<column> contains(strings_column_view const& strings,
                                  strings_column_view const& targets,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::contains(strings, targets, cudf::get_default_stream(), mr);
+  return detail::contains(strings, targets, stream, mr);
 }
 
 std::unique_ptr<column> starts_with(strings_column_view const& strings,
                                     string_scalar const& target,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::starts_with(strings, target, cudf::get_default_stream(), mr);
+  return detail::starts_with(strings, target, stream, mr);
 }
 
 std::unique_ptr<column> starts_with(strings_column_view const& strings,
                                     strings_column_view const& targets,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::starts_with(strings, targets, cudf::get_default_stream(), mr);
+  return detail::starts_with(strings, targets, stream, mr);
 }
 
 std::unique_ptr<column> ends_with(strings_column_view const& strings,
                                   string_scalar const& target,
+                                  rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::ends_with(strings, target, cudf::get_default_stream(), mr);
+  return detail::ends_with(strings, target, stream, mr);
 }
 
 std::unique_ptr<column> ends_with(strings_column_view const& strings,
                                   strings_column_view const& targets,
+                                  rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::ends_with(strings, targets, cudf::get_default_stream(), mr);
+  return detail::ends_with(strings, targets, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu
index 4a823ad1dcb..fcaec835f4d 100644
--- a/cpp/src/strings/search/find_multiple.cu
+++ b/cpp/src/strings/search/find_multiple.cu
@@ -70,8 +70,8 @@ std::unique_ptr<column> find_multiple(strings_column_view const& input,
   results->set_null_count(0);
 
   auto offsets = cudf::detail::sequence(strings_count + 1,
-                                        numeric_scalar<size_type>(0),
-                                        numeric_scalar<size_type>(targets_count),
+                                        numeric_scalar<size_type>(0, true, stream),
+                                        numeric_scalar<size_type>(targets_count, true, stream),
                                         stream,
                                         mr);
   return make_lists_column(strings_count,
@@ -88,10 +88,11 @@ std::unique_ptr<column> find_multiple(strings_column_view const& input,
 // external API
 std::unique_ptr<column> find_multiple(strings_column_view const& input,
                                       strings_column_view const& targets,
+                                      rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::find_multiple(input, targets, cudf::get_default_stream(), mr);
+  return detail::find_multiple(input, targets, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu
index 2df64c6a0a7..acea4ff1c51 100644
--- a/cpp/src/strings/search/findall.cu
+++ b/cpp/src/strings/search/findall.cu
@@ -134,10 +134,11 @@ std::unique_ptr<column> findall(strings_column_view const& input,
 
 std::unique_ptr<column> findall(strings_column_view const& input,
                                 regex_program const& prog,
+                                rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::findall(input, prog, cudf::get_default_stream(), mr);
+  return detail::findall(input, prog, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 4923ef5c903..6414962903e 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -627,7 +627,10 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
-ConfigureTest(STREAM_STRINGS_TEST streams/strings/case_test.cpp STREAM_MODE testing)
+ConfigureTest(
+  STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE
+  testing
+)
 
 # ##################################################################################################
 # Install tests ####################################################################################
diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp
new file mode 100644
index 00000000000..b734a1738cc
--- /dev/null
+++ b/cpp/tests/streams/strings/find_test.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/strings/find.hpp>
+#include <cudf/strings/find_multiple.hpp>
+#include <cudf/strings/findall.hpp>
+#include <cudf/strings/regex/regex_program.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <string>
+
+class StringsFindTest : public cudf::test::BaseFixture {};
+
+TEST_F(StringsFindTest, Find)
+{
+  auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""});
+  auto view  = cudf::strings_column_view(input);
+
+  auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream());
+  cudf::strings::find(view, target, 0, -1, cudf::test::get_default_stream());
+  cudf::strings::rfind(view, target, 0, -1, cudf::test::get_default_stream());
+  cudf::strings::find(view, view, 0, cudf::test::get_default_stream());
+  cudf::strings::find_multiple(view, view, cudf::test::get_default_stream());
+  cudf::strings::contains(view, target, cudf::test::get_default_stream());
+  cudf::strings::starts_with(view, target, cudf::test::get_default_stream());
+  cudf::strings::starts_with(view, view, cudf::test::get_default_stream());
+  cudf::strings::ends_with(view, target, cudf::test::get_default_stream());
+  cudf::strings::ends_with(view, view, cudf::test::get_default_stream());
+
+  auto const pattern = std::string("[a-z]");
+  auto const prog    = cudf::strings::regex_program::create(pattern);
+  cudf::strings::findall(view, *prog, cudf::test::get_default_stream());
+}

From 05ee2604d8f4e7c6525d12926100e2b11b6d6cb0 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 21 Sep 2023 10:45:11 -0400
Subject: [PATCH 23/23] Fix kernel launch error for
 cudf::io::orc::gpu::rowgroup_char_counts_kernel (#14139)

Fixes a memcheck error found during the nightly builds in gtest `OrcWriterNumericTypeTest/0.SingleColumn`.

```
# compute-sanitizer --tool memcheck gtests/ORC_TEST --gtest_filter=OrcWriterNumericTypeTest/0.SingleColumn --rmm_mode=cuda
========= COMPUTE-SANITIZER
Note: Google Test filter = OrcWriterNumericTypeTest/0.SingleColumn
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from OrcWriterNumericTypeTest/0, where TypeParam = signed char
[ RUN      ] OrcWriterNumericTypeTest/0.SingleColumn
========= Program hit cudaErrorInvalidConfiguration (error 9) due to "invalid configuration argument" on CUDA API call to cudaLaunchKernel.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x480aa6]
=========                in /usr/lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:cudaLaunchKernel [0x6c358]
=========                in /conda/envs/rapids/lib/libcudart.so.11.0
=========     Host Frame:__device_stub__ZN4cudf2io3orc3gpu27rowgroup_char_counts_kernelENS_6detail11base_2dspanIiNS_11device_spanEEENS5_IKNS1_22orc_column_device_viewELm18446744073709551615EEENS4_IKNS1_13rowgroup_rowsES5_EENS5_IKjLm18446744073709551615EEE(cudf::detail::base_2dspan<int, cudf::device_span>&, cudf::device_span<cudf::io::orc::orc_column_device_view const, 18446744073709551615ul>&, cudf::detail::base_2dspan<cudf::io::orc::rowgroup_rows const, cudf::device_span>&, cudf::device_span<unsigned int const, 18446744073709551615ul>&) [0x14fccb4]

```

Adds a check to avoid the kernel launch if the number of strings columns is zero.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14139
---
 cpp/src/io/orc/dict_enc.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 0007530a5af..1d2262a1ccc 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -60,6 +60,7 @@ void rowgroup_char_counts(device_2dspan<size_type> counts,
 
   auto const num_rowgroups = rowgroup_bounds.size().first;
   auto const num_str_cols  = str_col_indexes.size();
+  if (num_str_cols == 0) { return; }
 
   int block_size    = 0;  // suggested thread count to use
   int min_grid_size = 0;  // minimum block count required